# return_result.py

import sys
sys.path.append("./bert_bilstm_crf_ner")
sys.path.append("./bert_re")
import bert_bilstm_crf_ner.config as ner_config
# import bert_bilstm_crf_ner.bert_ner_model as ner_model
import bert_bilstm_crf_ner.main as ner_main
import bert_re.main as re_main
import bert_re.bert_config as re_config
import os
import re
import logging
from transformers import BertTokenizer
from bert_bilstm_crf_ner import bert_ner_model as ner_model
import bert_re.models as re_model

logger = logging.getLogger(__name__)

results = []
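# When run as a script, the root logger is left unconfigured, so the
# logger.info() calls below (e.g. the label map) produce no visible output
# by default. An optional, minimal setup would be:
#   logging.basicConfig(level=logging.INFO)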
def get_ner_result(raw_text):
    # Named entity recognition (NER) configuration
    model_name = 'bert_crf'
    ner_args = ner_config.Args().get_parser()
    ner_args.bert_dir = './model_hub/chinese-roberta-wwm-ext/'
    ner_args.gpu_ids = "-1"  # run on CPU
    ner_args.use_lstm = 'False'  # string-valued flags, as defined by the argparse config
    ner_args.use_crf = 'True'
    ner_args.num_tags = 5
    ner_args.max_seq_len = 512
    ner_args.num_layers = 1
    ner_args.lstm_hidden = 128

    # Build the NER label <-> id maps from the label file
    nerlabel2id = {}
    id2nerlabel = {}
    with open('./data/dgre/mid_data/ner_labels.txt', 'r') as fp:
        ner_labels = fp.read().strip().split('\n')
    for i, j in enumerate(ner_labels):
        nerlabel2id[j] = i
        id2nerlabel[i] = j
    logger.info(id2nerlabel)

    bertForNer = ner_main.BertForNer(ner_args, None, None, None, id2nerlabel)
    model_path = './bert_bilstm_crf_ner/checkpoints/{}/model.pt'.format(model_name)
    pred_entities = bertForNer.predict(raw_text, model_path)
    return pred_entities
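# Shape note: judging from how get_re_result() consumes this output,
# predict() is assumed to yield (entity_text, start_offset, role) tuples,
# where role is 'subject' or 'object', e.g. (hypothetical values):
#   [('转向机', 10, 'subject'), ('转向轴', 14, 'object')]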
def get_re_result(entities, raw_text):
    # First, split the entities into subjects and objects,
    # stored as (text, start, start + len(text)) character spans
    subjects = []
    objects = []
    results = []
    for info in entities:
        print(info)
        if info[2] == 'subject':
            subjects.append((info[0], info[1], info[1] + len(info[0])))
        elif info[2] == 'object':
            objects.append((info[0], info[1], info[1] + len(info[0])))
    print(subjects)
    print(objects)

    # Relation extraction (RE) configuration
    re_args = re_config.Args().get_parser()
    re_args.bert_dir = './model_hub/chinese-roberta-wwm-ext/'
    re_args.gpu_ids = "-1"
    re_args.num_tags = 5
    re_args.max_seq_len = 512
    trainer = re_main.Trainer(re_args, None, None, None)
    re_args.output_dir = './bert_re/checkpoints/'
    tokenizer = BertTokenizer.from_pretrained(re_args.bert_dir)

    process_data = transforme_re_data(subjects, objects, raw_text)

    # Build the relation label <-> id maps
    label2id = {}
    id2label = {}
    with open('./data/dgre/re_mid_data/rels.txt', 'r', encoding='utf-8') as fp:
        labels = fp.read().strip().split('\n')
    for i, j in enumerate(labels):
        label2id[j] = i
        id2label[i] = j

    # Classify the relation for each marked subject-object pair
    for data in process_data:
        relation = trainer.predict(tokenizer, data[0], id2label, re_args, data[1])
        result = {}
        print("==========================")
        print(raw_text)
        print("subject:", data[2][0])
        print("object:", data[2][1])
        print("relation:", "".join(relation))
        result["start"] = data[2][0][0]
        result["end"] = data[2][1][0]
        result["relation"] = relation
        results.append(result)
    return results
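# Shape note: each entry pairs the entity texts with the predicted relation
# labels, e.g. (hypothetical values):
#   [{'start': '转向机', 'end': '转向轴', 'relation': ['部件故障']}]
# 'start' and 'end' hold the subject/object texts (data[2][0][0] and
# data[2][1][0]), not character offsets.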
def transforme_re_data(subjects, objects, text):
    # For every (subject, object) pair, mark the pair in the text: the
    # subject ends up wrapped as #subject# and the object as $object$,
    # which is the input format the RE model expects.
    tmp_text = text
    process_data = []
    for sub in subjects:
        for obj in objects:
            # Mask each entity span with a same-length run of placeholder
            # characters ('&' for the subject, '%' for the object), then wrap
            # the runs with '#'/'$' markers. The order is flipped when the
            # object string is contained in the subject string, so that the
            # placeholder runs get wrapped correctly.
            if obj[0] in sub[0]:
                text = text[:sub[1]] + '&' * len(sub[0]) + text[sub[2]:]
                text = text[:obj[1]] + '%' * len(obj[0]) + text[obj[2]:]
                text = re.sub('&' * len(sub[0]), '#' + '&' * len(sub[0]) + '#', text)
                text = re.sub('%' * len(obj[0]), '$' + '%' * len(obj[0]) + '$', text)
            else:
                text = text[:obj[1]] + '%' * len(obj[0]) + text[obj[2]:]
                text = text[:sub[1]] + '&' * len(sub[0]) + text[sub[2]:]
                text = re.sub('%' * len(obj[0]), '$' + '%' * len(obj[0]) + '$', text)
                text = re.sub('&' * len(sub[0]), '#' + '&' * len(sub[0]) + '#', text)
            try:
                # Recompute the entity spans in the marked text; the original
                # character offsets are stale once '#'/'$' are inserted
                sub_re = re.search('&' * len(sub[0]), text)
                sub_re_span = sub_re.span()
                sub_re_start = sub_re_span[0]
                sub_re_end = sub_re_span[1] + 1
                obj_res = re.search('%' * len(obj[0]), text)
                obj_re_span = obj_res.span()
                obj_re_start = obj_re_span[0]
                obj_re_end = obj_re_span[1] + 1
                # Put the original entity strings back inside the markers
                text = re.sub('&' * len(sub[0]), sub[0], text)
                text = re.sub('%' * len(obj[0]), obj[0], text)
            except Exception as e:
                print(e)
                continue
            # Pass the recomputed, marker-adjusted spans along with the pair
            process_data.append((text, [sub_re_start, sub_re_end, obj_re_start, obj_re_end], (sub, obj)))
            # Restore text for the next pair
            text = tmp_text
    return process_data
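# Illustrative walk-through on a toy pair (hypothetical entities, not model
# output), assuming text = '转向机与转向轴处缺油':
#   sub = ('转向机', 0, 3), obj = ('转向轴', 4, 7)
#   masked : '&&&与%%%处缺油'
#   marked : '#&&&#与$%%%$处缺油'
#   final  : '#转向机#与$转向轴$处缺油'
# with spans [1, 5, 7, 11] pointing into the marked string (end indices
# include the trailing marker character).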
def extract(raw_text):
    results = []
    entities = get_ner_result(raw_text)
    result = get_re_result(entities, raw_text)
    results.append(result)
    return results
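# Minimal usage sketch (assumes the checkpoints and data files configured
# above are in place):
#   triples = extract('故障现象:转向时有“咯噔”声...')
#   # -> a one-element list wrapping the get_re_result() output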
if __name__ == '__main__':
    # raw_texts = [
    #     '明早起飞》是由明太鱼作词,满江作曲,戴娆演唱的一首歌曲',
    #     '古董相机收藏与鉴赏》是由高继生、高峻岭编著,浙江科学技术出版社出版的一本书籍',
    #     '谢顺光,男,祖籍江西都昌,出生于景德镇陶瓷世家',
    # ]
    raw_texts = [
        '故障现象:转向时有“咯噔”声原因分析:转向机与转向轴处缺油解决措施:向此处重新覆盖一层润滑脂后,故障消失',
        '1045号汽车故障报告故障现象打开点火开关,操作左前电动座椅开关,座椅6个方向均不动作故障原因六向电动座椅线束磨破搭铁修复方法包扎磨破线束,从新固定。',
    ]
    results = []
    for raw_text in raw_texts:
        entities = get_ner_result(raw_text)
        result = get_re_result(entities, raw_text)
        results.append(result)
    print(results)