Commit a65db92a by TJL233

add files 5.2

parent a1bff93d
Six source diffs could not be displayed because the files are too large. You can view the blobs instead.
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false
}
---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
- transformers
---
# {MODEL_NAME}
This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
<!--- Describe your model here -->
## Usage (Sentence-Transformers)
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
```
pip install -U sentence-transformers
```
Then you can use the model like this:
```python
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]
model = SentenceTransformer('{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)
```
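For semantic search, the resulting embeddings can be scored with cosine similarity via `sentence_transformers.util.cos_sim`. A minimal sketch, assuming a made-up corpus and query (only `{MODEL_NAME}` comes from this card):
```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('{MODEL_NAME}')

# Hypothetical corpus and query, for illustration only
corpus = ["This is an example sentence", "Each sentence is converted", "A completely unrelated sentence"]
query = "An example sentence"

corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Cosine similarity between the query and every corpus sentence
scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
best = int(scores.argmax())
print(corpus[best], float(scores[best]))
```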
## Usage (HuggingFace Transformers)
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, pass your input through the transformer model, then apply the right pooling operation on top of the contextualized word embeddings.
```python
from transformers import AutoTokenizer, AutoModel
import torch
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
# Sentences we want sentence embeddings for
sentences = ['This is an example sentence', 'Each sentence is converted']
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}')
model = AutoModel.from_pretrained('{MODEL_NAME}')
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print("Sentence embeddings:")
print(sentence_embeddings)
```
## Evaluation Results
<!--- Describe how your model was evaluated -->
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
## Training
The model was trained with the parameters:
**DataLoader**:
`torch.utils.data.dataloader.DataLoader` of length 3722 with parameters:
```
{'batch_size': 64, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
```
**Loss**:
`sentence_transformers.losses.CosineSimilarityLoss.CosineSimilarityLoss`
Parameters of the fit()-Method:
```
{
  "epochs": 6,
  "evaluation_steps": 1000,
  "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator",
  "max_grad_norm": 1,
  "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
  "optimizer_params": {
    "lr": 2e-05
  },
  "scheduler": "WarmupLinear",
  "steps_per_epoch": null,
  "warmup_steps": 2234,
  "weight_decay": 0.01
}
```
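As a rough sketch, these parameters map onto a `model.fit()` call along the following lines; the placeholder training pairs, the model name, and the output path are illustrative assumptions (the actual training script is included later in this commit):
```python
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer('{MODEL_NAME}')

# Placeholder data: each pair carries a similarity label in [0, 1]
train_samples = [InputExample(texts=['sentence A', 'sentence B'], label=0.8)]
dev_samples = [InputExample(texts=['sentence A', 'sentence C'], label=0.3)]

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=64)
train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=6,
    evaluation_steps=1000,
    warmup_steps=2234,                   # ~10% of 3722 steps/epoch * 6 epochs
    optimizer_params={'lr': 2e-05},
    weight_decay=0.01,
    max_grad_norm=1,
    output_path='output/entity-similarity-model',  # hypothetical output path
)
```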
## Full Model Architecture
```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 30, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
```
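The same stack can be rebuilt from modules. A minimal sketch, assuming the base checkpoint is `bert-base-chinese` (the name-or-path recorded in this commit's config):
```python
from sentence_transformers import SentenceTransformer, models

# Token embeddings from the base transformer, truncating inputs to 30 tokens
word_embedding_model = models.Transformer('bert-base-chinese', max_seq_length=30)

# Mean pooling over the token embeddings -> one 768-dimensional sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```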
## Citing & Authors
<!--- Describe where people can find more information -->
{
  "_name_or_path": "D:\\anoconda\\envs\\opennre\\pretrain\\bert-base-chinese",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}
{
  "__version__": {
    "sentence_transformers": "2.2.2",
    "transformers": "4.19.4",
    "pytorch": "1.12.1"
  }
}
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,1000,0.8437845648676535,0.6035401185558359,0.8504096068747292,0.6029035420965714,0.8488711604488226,0.6026591024286487,0.8445383623667045,0.6035690090832289
0,2000,0.8935672017006747,0.6089277058133216,0.8993714525229487,0.6089286089095641,0.8982502749171174,0.6087811447310354,0.8952999954129647,0.6092893492793621
0,3000,0.9111150974349347,0.6102740797975597,0.9121018987116367,0.6098969953760157,0.9111916868136933,0.6093799089754433,0.9142359219443127,0.6115740437090497
0,-1,0.9205942236069623,0.611609596968196,0.91967000874278,0.6108586173137909,0.9188931419032553,0.6104703022717178,0.9221861883085548,0.6118700822896079
1,1000,0.9262847408643023,0.6110517039158152,0.9256525043554035,0.6107060926335184,0.9249270660602564,0.6105567049552368,0.927747130106175,0.6119398447434364
1,2000,0.9305485000884564,0.6108210514978757,0.9282247143547125,0.6104685724060088,0.9275207989706534,0.6101088388036945,0.9320225187948131,0.6114355902862035
1,3000,0.937293923227913,0.6123948376417697,0.932594167268225,0.6116891054612364,0.9319296166923247,0.6112649022701788,0.9387973108145993,0.6133410449821489
1,-1,0.9397338568421564,0.6133559302951734,0.934217258815834,0.6125543199204074,0.9335433879265738,0.6121143543400612,0.9412434619076796,0.6142257655640119
2,1000,0.9398135936858681,0.6128548515495744,0.9335101847037173,0.6119080490526673,0.9329285131406844,0.6115995658086126,0.9406354498810451,0.6137532037069394
2,2000,0.9405905381901346,0.6132498924033661,0.9347816002418357,0.612290103029227,0.934188006641766,0.6120640463689874,0.9408684621376949,0.6135873952504144
2,3000,0.9422908546839772,0.6130001082307834,0.936151083098645,0.6122333274489373,0.9355485606767819,0.6119347715445361,0.9428214146487125,0.6137387709855698
2,-1,0.9454001461717783,0.6127737417303069,0.9389878183674979,0.6121081449018061,0.9384803985993851,0.6120056054740768,0.9458454038386815,0.6133566395501411
3,1000,0.9447919756345161,0.6134061478976345,0.9384394859295686,0.6124496891596876,0.9379099350158963,0.6124278206274265,0.9442121306272767,0.613203379039636
3,2000,0.9439834324722985,0.6125950367749929,0.9374559935639454,0.6116498350430999,0.9369188801668993,0.61141128207572,0.944827117891961,0.6135478795626684
3,3000,0.9453801376875116,0.6128384049683848,0.9380745231063867,0.6117609334201393,0.9376015409902312,0.6118088273439997,0.9459593519702346,0.6135829027798863
3,-1,0.945727118096799,0.6126144654437906,0.9386445544683326,0.6116246616890176,0.9381795211560322,0.6114432199455765,0.9458788053203874,0.6132647632878151
4,1000,0.9455493204368942,0.6123298258640822,0.9384364446476412,0.6114608800150806,0.9379093145783454,0.6111532358753103,0.9456160146254234,0.6126410451481927
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "D:\\anoconda\\envs\\opennre\\pretrain\\bert-base-chinese", "tokenizer_class": "BertTokenizer"}
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "D:\\anoconda\\envs\\opennre\\pretrain\\bert-base-chinese", "tokenizer_class": "BertTokenizer"}
"""
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT, etc.) for the STSbenchmark from scratch. It generates sentence embeddings
that can be compared using cosine similarity to measure their similarity.
Usage:
python training_nli.py
OR
python training_nli.py pretrained_transformer_model_name
"""
import json
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout
# Check if the dataset exists. If not, download and extract it
# sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
#
# if not os.path.exists(sts_dataset_path):
# util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = r'D:\anoconda\envs\opennre\pretrain\bert-base-chinese'
# Read the dataset
train_batch_size = 64
num_epochs = 6
# model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = r'E:\科大工作\知识图谱\基于BERT模型得自然语言处理实战\KgCLUE-main\datasets\自制算法\实体相似度'+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name,max_seq_length=30)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
error_count=0
with open('实体相似_train.txt', 'rt', encoding='utf8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        try:
            line = line.strip()
            inp_example = InputExample(texts=[line.split('\t')[0], line.split('\t')[1]], label=float(line.split('\t')[2]))
            train_samples.append(inp_example)
        except:
            error_count += 1
            print(error_count)
with open('实体相似_val.txt', 'rt', encoding='utf8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        try:
            line = line.strip()
            inp_example = InputExample(texts=[line.split('\t')[0], line.split('\t')[1]], label=float(line.split('\t')[2]))
            dev_samples.append(inp_example)
        except:
            error_count += 1
            print(error_count)
with open('实体相似_val.txt', 'rt', encoding='utf8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        line = line.strip()
        inp_example = InputExample(texts=[line.split('\t')[0], line.split('\t')[1]], label=float(line.split('\t')[2]))
        test_samples.append(inp_example)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)  # cosine similarity loss
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')
# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of the training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
import random
import re
#encoding=utf8
import os
import sys
import jieba
import pickle
from gensim.summarization import bm25
from tqdm import tqdm
class BM25Retrieval(object):
    """BM25-based retrieval over a list of candidate entities."""
    def __init__(self, entities):
        super(BM25Retrieval, self).__init__()
        self.kb_entitys = entities
        self.bm25Model = bm25.BM25([list(i) for i in self.kb_entitys])  # build a BM25 index over the candidate entities, each split into characters
    def retrieval(self, query, top_k):  # return the top_k candidate entities ranked by BM25 similarity to the query
        scores = self.bm25Model.get_scores(query)
        match_score = {e: s for e, s in zip(self.kb_entitys, scores)}
        match_score = sorted(match_score.items(), key=lambda x: x[1], reverse=True)
        return [i[0] for i in match_score[:top_k]]
def LCS(word1: str, word2: str) -> str:
    m = len(word1)
    n = len(word2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    # dp[i][j] is the length of the longest common substring ending at word1[i-1] and word2[j-1]
    max_len = 0
    row = 0
    col = 0
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if word1[i - 1] == word2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if max_len < dp[i][j]:
                    max_len = dp[i][j]
                    row = i
                    col = j
    max_str = ""
    i = row
    j = col
    while i > 0 and j > 0:
        if dp[i][j] == 0:
            break
        i -= 1
        j -= 1
        max_str += word1[i]
    lcstr = max_str[::-1]
    # the longest common substring, recovered by backtracking
    return lcstr
neg_sample=[]
pos_sample=[]
all_sample=[]
houzhui=[]
ner_entity=[]
with open(r'E:\科大工作\知识图谱\基于BERT模型得自然语言处理实战\KgCLUE-main\datasets\train.json', 'r', encoding='utf-8') as fp:
    lines = fp.readlines()

lines_dict = [eval(i) for i in lines]
all_question = [k["question"] for k in lines_dict]
all_sample = [LCS(j['answer'].split('|||')[0].strip(), j["question"]) for j in lines_dict]
bm25Model = BM25Retrieval(all_sample)

for line in tqdm(lines):
    line_dict = eval(line)
    answer = line_dict['answer'].split('|||')[0]
    answer = answer.strip()
    if answer not in line_dict["question"]:
        LCS_str = LCS(answer, line_dict["question"])
        ner_entity.append(LCS_str)
        pos_sample.append([LCS_str, answer])
        retr_3 = bm25Model.retrieval(LCS_str, 8)
        while LCS_str in retr_3:
            retr_3.remove(LCS_str)
        while answer in retr_3:
            retr_3.remove(answer)
        neg_sample.append([retr_3[0], answer])
        neg_sample.append([retr_3[1], answer])
        neg_sample.append([retr_3[2], answer])
        houzhui.append(answer.replace(LCS_str, ''))
    else:
        retr_3 = bm25Model.retrieval(answer, 8)
        while answer in retr_3:
            retr_3.remove(answer)
        ner_entity.append(answer)
        neg_sample.append([retr_3[0], answer])
        neg_sample.append([retr_3[1], answer])
        neg_sample.append([retr_3[2], answer])
        pos_sample.append([answer + random.choice(houzhui), answer])
error_count=0
with open(r'../../../Bert_CRF-main/data/A&Q/数据_train.txt', 'w', encoding='utf-8') as fd:
    for num, answer_str in tqdm(enumerate(all_question)):
        try:
            ner_scope = re.search(ner_entity[num], answer_str).span()
        except:
            error_count += 1
            print(ner_entity[num], answer_str)
            print(error_count)
            continue
        for index, str1 in enumerate(answer_str):
            if index == ner_scope[0]:
                fd.write(str1 + '\t' + 'B-entity' + '\n')
            elif ner_scope[0] < index < ner_scope[1]:  # span() end is exclusive
                fd.write(str1 + '\t' + 'I-entity' + '\n')
            else:
                fd.write(str1 + '\t' + 'O' + '\n')
        fd.write('\n')
with open(r'实体相似_train.txt', 'w', encoding='utf-8') as fg:
    for num1, pos in enumerate(pos_sample):
        fg.write(pos[0] + '\t' + pos[1] + '\t' + '1' + '\n')
        fg.write(neg_sample[3 * num1][0] + '\t' + neg_sample[3 * num1][1] + '\t' + '0' + '\n')
        fg.write(neg_sample[3 * num1 + 1][0] + '\t' + neg_sample[3 * num1 + 1][1] + '\t' + '0' + '\n')
        fg.write(neg_sample[3 * num1 + 2][0] + '\t' + neg_sample[3 * num1 + 2][1] + '\t' + '0' + '\n')