# cutSentences.py

import re


def cut_sentences_v1(sent):
    """
    First-pass sentence splitting on end-of-sentence punctuation.
    """
    sent = re.sub(r'([。!?\?])([^”’])', r"\1\n\2", sent)  # single-character sentence terminators
    sent = re.sub(r'(\.{6})([^”’])', r"\1\n\2", sent)  # English ellipsis ("......")
    sent = re.sub(r'(…{2})([^”’])', r"\1\n\2", sent)  # Chinese ellipsis ("……")
    # If a terminator precedes a closing quote, the quote ends the sentence,
    # so the split marker \n goes after the quote.
    sent = re.sub(r'([。!?\?][”’])([^,。!?\?])', r"\1\n\2", sent)
    return sent.split("\n")
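
# Illustrative sketch (the example string is mine, not from the original file):
#   cut_sentences_v1("今天天气很好。我们出去玩吧!")
#   -> ["今天天气很好。", "我们出去玩吧!"]
# The trailing "!" triggers no split because no character follows it.
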
def cut_sentences_v2(sent):
    """
    Second-pass sentence splitting on semicolons (';' or ';').
    """
    sent = re.sub(r'([;;])([^”’])', r"\1\n\2", sent)
    return sent.split("\n")
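
# Illustrative sketch (the example string is mine, not from the original file):
#   cut_sentences_v2("水果;蔬菜;肉类")
#   -> ["水果;", "蔬菜;", "肉类"]
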
def cut_sent_for_bert(text, max_seq_len):
    # Split the text at fine granularity, then re-merge the pieces into
    # chunks that fit the model's maximum sequence length.
    sentences = []
    # Fine-grained splitting.
    sentences_v1 = cut_sentences_v1(text)
    print("sentences_v1=", sentences_v1)
    for sent_v1 in sentences_v1:
        if len(sent_v1) > max_seq_len - 2:
            sentences_v2 = cut_sentences_v2(sent_v1)
            sentences.extend(sentences_v2)
        else:
            sentences.append(sent_v1)
    assert ''.join(sentences) == text

    # Greedy merge of adjacent sentences.
    merged_sentences = []
    start_index_ = 0
    while start_index_ < len(sentences):
        tmp_text = sentences[start_index_]
        end_index_ = start_index_ + 1
        # For BERT, subtract 2 from the maximum length to reserve room for
        # the special tokens ([CLS] and [SEP]).
        while end_index_ < len(sentences) and \
                len(tmp_text) + len(sentences[end_index_]) <= max_seq_len - 2:
            tmp_text += sentences[end_index_]
            end_index_ += 1
        start_index_ = end_index_
        merged_sentences.append(tmp_text)
    return merged_sentences
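
# Illustrative sketch (text and length are mine, not from the original file):
# with max_seq_len=10 the merge loop packs sentences greedily up to
# max_seq_len - 2 = 8 characters:
#   cut_sent_for_bert("天气好。出去玩!回家吃饭。", 10)
#   -> ["天气好。出去玩!", "回家吃饭。"]
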
def refactor_labels(sent, labels, start_index):
    """
    Recompute label offsets after sentence splitting.
    :param sent: a sentence after splitting and re-merging
    :param labels: the original document-level labels
    :param start_index: the offset of this sentence within the document
    :return: list of (type, entity, offset) tuples
    """
    new_labels = []
    end_index = start_index + len(sent)
    # _label: (label id, entity type, start offset, end offset, entity text)
    for _label in labels:
        if start_index <= _label[2] <= _label[3] <= end_index:
            new_offset = _label[2] - start_index
            assert sent[new_offset: new_offset + len(_label[-1])] == _label[-1]
            new_labels.append((_label[1], _label[-1], new_offset))
        # The label is truncated by the sentence boundary.
        elif _label[2] < end_index < _label[3]:
            raise RuntimeError(f'{sent}, {_label}')
    return new_labels
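
# Illustrative sketch (the label comes from the demo data below; the sentence
# offset is hypothetical): for a sentence starting at document offset 73,
# refactor_labels rebases ["T4", "GPE", 73, 76, "沙头角"] to
# ("GPE", "沙头角", 0), i.e. the entity offset becomes relative to the
# start of the sentence.
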
if __name__ == '__main__':
    raw_examples = [{
        "text": "深圳市沙头角保税区今后五年将充分发挥保税区的区位优势和政策优势,以高新技术产业为先导,积极调整产品结构,实施以转口贸易和仓储业为辅助的经营战略。把沙头角保税区建成按国际惯例运作、国内领先的特殊综合经济区域,使其成为该市外向型经济的快速增长点。",
        "labels": [
            ["T0", "GPE", 0, 3, "深圳市"],
            ["T1", "GPE", 3, 6, "沙头角"],
            ["T2", "LOC", 6, 9, "保税区"],
            ["T3", "LOC", 18, 21, "保税区"],
            ["T4", "GPE", 73, 76, "沙头角"],
            ["T5", "LOC", 76, 79, "保税区"],
        ]
    }]

    for item in raw_examples:
        text = item['text']
        print(text[:90])
        sentences = cut_sent_for_bert(text, 90)
        start_index = 0
        for sent in sentences:
            labels = refactor_labels(sent, item['labels'], start_index)
            start_index += len(sent)
            print(sent)
            print(labels)