nlp_utils.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. from transformers import pipeline
  2. import re
# Pretrained text-classification pipeline, built once when this module is
# imported. The checkpoint name indicates a DistilBERT model fine-tuned on
# SST-2 (English sentiment); classify_activity_type() checks its labels for
# the substrings "positive" / "negative".
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
  5. def classify_activity_type(description):
  6. """根据描述文本对活动进行类型分类"""
  7. result = classifier(description)[0]
  8. label = result['label']
  9. # 简化标签映射到具体活动类型
  10. if 'positive' in label.lower():
  11. return "学术讲座"
  12. elif 'negative' in label.lower():
  13. return "文体比赛"
  14. else:
  15. return "其他"
# Named-entity-recognition pipeline, built once when this module is imported.
# extract_information() reads the 'entity' tag ('B-PER' / 'I-PER') and the
# 'word' of each item it returns.
ner_model = pipeline("ner", model="dslim/bert-base-NER")
  17. def extract_information(text):
  18. """从文本中抽取姓名和身份证号"""
  19. result = ner_model(text)
  20. name = None
  21. id_number = None
  22. for entity in result:
  23. if entity['entity'] == 'B-PER': # 姓名
  24. if name is None:
  25. name = entity['word'].replace('#', '')
  26. elif entity['entity'] == 'I-PER':
  27. name += entity['word'].replace('#', '')
  28. # 使用正则表达式匹配身份证号
  29. match = re.search(r'\d{17}[\dxX]|\d{15}', text)
  30. if match:
  31. id_number = match.group(0)
  32. return {'name': name, 'id_number': id_number}