1 В избранное 0 Ответвления 0

OSCHINA-MIRROR/zhangyafeii-lstm-crf

В этом репозитории не указан файл с открытой лицензией (LICENSE). При использовании обратитесь к конкретному описанию проекта и его зависимостям в коде.
Клонировать/Скачать
helper.py 6.4 КБ
Копировать Редактировать Web IDE Исходные данные Просмотреть построчно История
张亚飞 Отправлено 19.02.2019 07:56 d2557b3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 14 11:33:15 2019
@author: Zhang Yafei
"""
import pandas as pd
import pickle
import tensorflow as tf
from model import BiLSTM_CRF
import re
def write2word2id(comment=None):
    """Build a character-to-index vocabulary from the corpus and pickle it.

    Each distinct Chinese character in ``comment`` (after filtering out
    ASCII letters, digits, and punctuation) is assigned an integer id,
    and the resulting mapping is written to ``word2id.pkl``.

    :param comment: iterable of comment strings; None is treated as empty
                    (the original crashed on the default with a TypeError).
    :return: None -- the mapping is persisted to 'word2id.pkl' as a side effect.
    """
    from collections import OrderedDict

    # Unique characters across the whole corpus; set order is arbitrary,
    # so ids are deterministic only within a single run (as before).
    chars = set(''.join(comment or []))
    # Filter pattern -- users may customize the characters removed here.
    r1 = u'[a-zA-Z0-9’!"# $ %&\'()*+,-./:;<=>?@,。·α?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    filtered = re.sub(r1, '', ''.join(chars))
    # enumerate already yields ids in increasing order, so the original
    # sort-by-value + OrderedDict round trip is collapsed into one step.
    word2id = OrderedDict((ch, idx) for idx, ch in enumerate(filtered))
    with open('word2id.pkl', mode='wb') as f:
        pickle.dump(word2id, f)
def read_from_word2id():
    """Load and return the pickled char-to-index vocabulary.

    :return: the mapping previously written to 'word2id.pkl'.
    """
    with open('word2id.pkl', mode='rb') as handle:
        return pickle.load(handle)
def transfer_corpus(row):
    """Strip English letters, digits, and punctuation from a comment cell.

    :param row: a DataFrame row exposing a ``评论`` (comment) attribute.
    :return: the cleaned, whitespace-stripped comment text.
    """
    # Filter pattern -- users may customize the characters removed here.
    pattern = u'[a-zA-Z0-9’!"# $ %&\'()*,.+-/:;<=>?@,。·α?★、…【】《》?“”‘’![\\]^_`{|}/\n/\t~]+'
    return re.sub(pattern, '', row.评论).strip()
def _emit_tagged_chars(words_list, segment, begin_tag, inside_tag):
    """Append (char, tag) pairs for every character of ``segment``.

    Characters that are annotation letters ('B', 'P', 'S', 'I', 'O') or
    whitespace are skipped.  The first raw position (index 0, counted
    BEFORE skipping -- preserved from the original logic) gets
    ``begin_tag``; all others get ``inside_tag``.
    """
    for n, w in enumerate(segment):
        w = w.strip()
        if w in 'BPSIO' or w == '':
            continue
        words_list.append((w, begin_tag if n == 0 else inside_tag))


def make_corpus(data=None, file_path=None):
    """Convert '/'-delimited annotated lines into CoNLL-style training data.

    Each item in ``data`` is split on '/'; an element starting with 'PB',
    'PI', 'SB', or 'SI' labels the PRECEDING element's characters with
    position (P) or symptom (S) BIO tags; everything else is tagged 'O'.
    One "char TAG" pair per line, sentences separated by blank lines.

    :param data: list of annotated strings; None is treated as empty.
    :param file_path: output path for the tagged corpus file.
    :return: None -- the corpus is written to ``file_path``.
    """
    words_list = []
    for item in data or []:
        line = item.split('/')
        for num, word in enumerate(line):
            if num == 0:
                continue
            if word.startswith('PB'):
                _emit_tagged_chars(words_list, line[num - 1], 'B-P', 'I-P')
            elif word.startswith('PI'):
                _emit_tagged_chars(words_list, line[num - 1], 'I-P', 'I-P')
            elif word.startswith('SB'):
                _emit_tagged_chars(words_list, line[num - 1], 'B-S', 'I-S')
            elif word.startswith('SI'):
                # BUG FIX: the original emitted 'I-P' here, mislabelling
                # symptom-inside characters as position tags.
                _emit_tagged_chars(words_list, line[num - 1], 'I-S', 'I-S')
            elif num == len(line) - 1:
                # Untagged final element: tag its own characters as outside.
                _emit_tagged_chars(words_list, line[num], 'O', 'O')
            else:
                _emit_tagged_chars(words_list, line[num - 1], 'O', 'O')
        words_list.append('\n')
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in words_list:
            if item != '\n' and type(item) == tuple:
                f.write(item[0] + ' ' + item[1] + '\n')
            elif item == '\n':
                f.write(item)
def parse_id(row):
    """Extract the drug ID from a row's drug-manual URL.

    :param row: a DataFrame row exposing a ``url`` attribute.
    :return: the path segment between the host and '/manual'.
    """
    # NOTE(review): raises AttributeError if the URL does not match --
    # same as the original behavior.
    matched = re.match('http://ypk.39.net/(.*?)/manual', row.url)
    return matched.group(1)
if __name__ == '__main__':
    # Step 1: read the labelled comment data.
    # data = pd.read_csv('label.csv',encoding='utf-8')
    # Step 2: convert every comment character into the word-to-id vocabulary.
    # comment = data['评论'].tolist()
    # write2word2id(comment)
    # word2id = read_from_word2id()
    # Step 3: convert to the model's corpus format; split train/test sets.
    # result = data['分词'].tolist()
    # with open('segmentation.txt', 'w') as f:
    #     f.write('\n'.join(result))
    # train_data = result[:1280]
    # test_data = result[1280:]
    # make_corpus(train_data, file_path='train_data')
    # make_corpus(test_data, file_path='test_data')
    # Step 4: run the trained model over all comments (done in main.py).
    # Step 5: collect the symptoms/positions the model extracted.
    comment_data = pd.read_csv('data/first50_comment.csv', encoding='utf-8')
    result_data = pd.read_csv('data/result.csv', encoding='utf-8')
    drug_adverse = pd.read_excel('data/drug_dict.xlsx')

    # Attach model predictions to the comments (note: 'postion' is the
    # historical column spelling used throughout -- kept for compatibility).
    comment_data['symptom'] = result_data['symptom']
    comment_data['postion'] = result_data['position']

    # Derive the drug ID from each manual URL and persist the dictionary.
    drug_adverse['药品ID'] = drug_adverse.apply(parse_id, axis=1)
    columns = ['通用名称', '商品名称', '药品ID', '适应症', '不良反应', 'url']
    drug_adverse.to_csv('data/first50_drug_dict.csv', index=False, columns=columns, encoding='utf_8_sig')
    first50_data = pd.read_csv('data/first50_drug_dict.csv', encoding='utf-8')

    # Remove the leading '[' and trailing ']' left by stringified lists.
    for col in ('symptom', 'postion'):
        comment_data.loc[:, col] = comment_data[col].str.strip('[')
        comment_data.loc[:, col] = comment_data[col].str.strip(']')

    first50_data['symptom'] = None
    first50_data['position'] = None
    # Aggregate the distinct non-empty predictions per drug.
    for drug_id in first50_data.药品ID.unique():
        matched = comment_data[comment_data.药品ID == drug_id]
        symptom = ','.join(set(filter(None, matched.symptom)))
        position = ','.join(set(filter(None, matched.postion)))
        first50_data.loc[first50_data.药品ID == drug_id, 'symptom'] = symptom
        first50_data.loc[first50_data.药品ID == drug_id, 'position'] = position

Опубликовать ( 0 )

Вы можете оставить комментарий после Вход в систему

1
https://api.gitlife.ru/oschina-mirror/zhangyafeii-lstm-crf.git
git@api.gitlife.ru:oschina-mirror/zhangyafeii-lstm-crf.git
oschina-mirror
zhangyafeii-lstm-crf
zhangyafeii-lstm-crf
master