12345678910111213141516171819202122232425262728293031323334353637383940 |
- from transformers import pipeline
- import re
# Load a pretrained text-classification pipeline once at import time so every
# call to the classification helper reuses the same model instance.
# NOTE(review): the checkpoint name refers to SST-2, which suggests a sentiment
# model rather than an activity-type model -- confirm this is the intended one.
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
def classify_activity_type(description):
    """Classify an activity into a type based on its description text.

    The description is passed to the module-level ``classifier`` pipeline and
    the label of its first prediction is mapped to an activity type.

    Args:
        description: Text describing the activity.

    Returns:
        "学术讲座" when the label contains "positive", "文体比赛" when it
        contains "negative" (both case-insensitive), otherwise "其他".
    """
    top_prediction = classifier(description)[0]
    normalized_label = top_prediction['label'].lower()

    # Simplified mapping from classifier label to activity type; entries are
    # checked in order, so "positive" takes precedence over "negative".
    keyword_to_activity = (
        ('positive', "学术讲座"),
        ('negative', "文体比赛"),
    )
    for keyword, activity_type in keyword_to_activity:
        if keyword in normalized_label:
            return activity_type
    return "其他"
# Load the named-entity-recognition pipeline once at import time; it is used
# by the extraction helper to find person-name tokens.
ner_model = pipeline("ner", model="dslim/bert-base-NER")
# An ID number is either 18 characters (17 digits plus a digit or X/x check
# character) or 15 digits. The lookarounds keep the pattern from matching a
# slice of a longer digit run (e.g. the first 15 digits of a 16-digit number).
_ID_NUMBER_PATTERN = re.compile(r'(?<!\d)(?:\d{17}[\dxX]|\d{15})(?!\d)')

_PERSON_TAGS = ('B-PER', 'I-PER')


def _first_person_name(entities):
    """Assemble the first person name from token-level NER results.

    Args:
        entities: Iterable of dicts with at least the keys ``'entity'`` (the
            tag, e.g. ``'B-PER'``) and ``'word'`` (the token text).

    Returns:
        The first person name found, with words separated by single spaces,
        or ``None`` when no person token is present.
    """
    pieces = []
    for entity in entities:
        tag = entity['entity']
        word = entity['word']
        is_subword = word.startswith('##')
        piece = word[2:] if is_subword else word

        if tag not in _PERSON_TAGS:
            if pieces:
                # A different entity type ends the name being collected.
                break
            continue

        if is_subword and pieces:
            # "##" marks a WordPiece continuation: glue it to the previous
            # word instead of starting a new one.
            pieces[-1] += piece
            continue

        if tag == 'B-PER' and pieces:
            # A new person begins; keep only the first name so that later
            # tokens are not appended to it.
            break

        # B-PER starts the name. An I-PER seen before any B-PER is also
        # accepted as the start rather than raising on a missing prefix.
        pieces.append(piece)

    return ' '.join(pieces) or None


def extract_information(text):
    """Extract a person's name and an ID number from text.

    The name comes from the module-level ``ner_model`` pipeline (first person
    entity only); the ID number is found with a regular expression.

    Args:
        text: Input text to scan. Empty or ``None`` input yields no results.

    Returns:
        A dict ``{'name': ..., 'id_number': ...}``; each value is a string or
        ``None`` when nothing was found.
    """
    if not text:
        return {'name': None, 'id_number': None}

    name = _first_person_name(ner_model(text))

    # Match the ID number with a regular expression.
    match = _ID_NUMBER_PATTERN.search(text)
    id_number = match.group(0) if match else None

    return {'name': name, 'id_number': id_number}
|