1 В избранное 0 Ответвления 0

OSCHINA-MIRROR/zhangyafeii-lstm-crf

В этом репозитории не указан файл с открытой лицензией (LICENSE). При использовании обратитесь к конкретному описанию проекта и его зависимостям в коде.
Клонировать/Скачать
helper.py 6.4 КБ
Копировать Редактировать Web IDE Исходные данные Просмотреть построчно История
张亚飞 Отправлено 19.02.2019 07:56 d2557b3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 14 11:33:15 2019
@author: Zhang Yafei
"""
import pandas as pd
import pickle
import tensorflow as tf
from model import BiLSTM_CRF
import re
def write2word2id(comment=None):
    """Build a character-to-index vocabulary from the corpus and pickle it.

    Each distinct Chinese character in ``comment`` (after filtering out
    ASCII letters, digits, and punctuation) is assigned an integer id,
    and the resulting mapping is written to ``word2id.pkl``.

    :param comment: iterable of comment strings; None is treated as empty
                    (the original crashed on the default with a TypeError).
    :return: None -- the mapping is persisted to 'word2id.pkl' as a side effect.
    """
    from collections import OrderedDict

    # Unique characters across the whole corpus; set order is arbitrary,
    # so ids are deterministic only within a single run (as before).
    chars = set(''.join(comment or []))
    # Filter pattern -- users may customize the characters removed here.
    r1 = u'[a-zA-Z0-9’!"# $ %&\'()*+,-./:;<=>?@,。·α?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    filtered = re.sub(r1, '', ''.join(chars))
    # enumerate already yields ids in increasing order, so the original
    # sort-by-value + OrderedDict round trip is collapsed into one step.
    word2id = OrderedDict((ch, idx) for idx, ch in enumerate(filtered))
    with open('word2id.pkl', mode='wb') as f:
        pickle.dump(word2id, f)
def read_from_word2id():
    """Load and return the pickled char-to-index vocabulary.

    :return: the mapping previously written to 'word2id.pkl'.
    """
    with open('word2id.pkl', mode='rb') as handle:
        return pickle.load(handle)
def transfer_corpus(row):
    """Strip English letters, digits, and punctuation from a comment cell.

    :param row: a DataFrame row exposing a ``评论`` (comment) attribute.
    :return: the cleaned, whitespace-stripped comment text.
    """
    # Filter pattern -- users may customize the characters removed here.
    pattern = u'[a-zA-Z0-9’!"# $ %&\'()*,.+-/:;<=>?@,。·α?★、…【】《》?“”‘’![\\]^_`{|}/\n/\t~]+'
    return re.sub(pattern, '', row.评论).strip()
def _emit_tagged_chars(words_list, segment, begin_tag, inside_tag):
    """Append (char, tag) pairs for every character of ``segment``.

    Characters that are annotation letters ('B', 'P', 'S', 'I', 'O') or
    whitespace are skipped.  The first raw position (index 0, counted
    BEFORE skipping -- preserved from the original logic) gets
    ``begin_tag``; all others get ``inside_tag``.
    """
    for n, w in enumerate(segment):
        w = w.strip()
        if w in 'BPSIO' or w == '':
            continue
        words_list.append((w, begin_tag if n == 0 else inside_tag))


def make_corpus(data=None, file_path=None):
    """Convert '/'-delimited annotated lines into CoNLL-style training data.

    Each item in ``data`` is split on '/'; an element starting with 'PB',
    'PI', 'SB', or 'SI' labels the PRECEDING element's characters with
    position (P) or symptom (S) BIO tags; everything else is tagged 'O'.
    One "char TAG" pair per line, sentences separated by blank lines.

    :param data: list of annotated strings; None is treated as empty.
    :param file_path: output path for the tagged corpus file.
    :return: None -- the corpus is written to ``file_path``.
    """
    words_list = []
    for item in data or []:
        line = item.split('/')
        for num, word in enumerate(line):
            if num == 0:
                continue
            if word.startswith('PB'):
                _emit_tagged_chars(words_list, line[num - 1], 'B-P', 'I-P')
            elif word.startswith('PI'):
                _emit_tagged_chars(words_list, line[num - 1], 'I-P', 'I-P')
            elif word.startswith('SB'):
                _emit_tagged_chars(words_list, line[num - 1], 'B-S', 'I-S')
            elif word.startswith('SI'):
                # BUG FIX: the original emitted 'I-P' here, mislabelling
                # symptom-inside characters as position tags.
                _emit_tagged_chars(words_list, line[num - 1], 'I-S', 'I-S')
            elif num == len(line) - 1:
                # Untagged final element: tag its own characters as outside.
                _emit_tagged_chars(words_list, line[num], 'O', 'O')
            else:
                _emit_tagged_chars(words_list, line[num - 1], 'O', 'O')
        words_list.append('\n')
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in words_list:
            if item != '\n' and type(item) == tuple:
                f.write(item[0] + ' ' + item[1] + '\n')
            elif item == '\n':
                f.write(item)
def parse_id(row):
    """Extract the drug ID from a row's drug-manual URL.

    :param row: a DataFrame row exposing a ``url`` attribute.
    :return: the path segment between the host and '/manual'.
    """
    # NOTE(review): raises AttributeError if the URL does not match --
    # same as the original behavior.
    matched = re.match('http://ypk.39.net/(.*?)/manual', row.url)
    return matched.group(1)
if __name__ == '__main__':
    # Step 1: read the labelled comment data.
    # data = pd.read_csv('label.csv',encoding='utf-8')
    # Step 2: convert every comment character into the word-to-id vocabulary.
    # comment = data['评论'].tolist()
    # write2word2id(comment)
    # word2id = read_from_word2id()
    # Step 3: convert to the model's corpus format; split train/test sets.
    # result = data['分词'].tolist()
    # with open('segmentation.txt', 'w') as f:
    #     f.write('\n'.join(result))
    # train_data = result[:1280]
    # test_data = result[1280:]
    # make_corpus(train_data, file_path='train_data')
    # make_corpus(test_data, file_path='test_data')
    # Step 4: run the trained model over all comments (done in main.py).
    # Step 5: collect the symptoms/positions the model extracted.
    comment_data = pd.read_csv('data/first50_comment.csv', encoding='utf-8')
    result_data = pd.read_csv('data/result.csv', encoding='utf-8')
    drug_adverse = pd.read_excel('data/drug_dict.xlsx')

    # Attach model predictions to the comments (note: 'postion' is the
    # historical column spelling used throughout -- kept for compatibility).
    comment_data['symptom'] = result_data['symptom']
    comment_data['postion'] = result_data['position']

    # Derive the drug ID from each manual URL and persist the dictionary.
    drug_adverse['药品ID'] = drug_adverse.apply(parse_id, axis=1)
    columns = ['通用名称', '商品名称', '药品ID', '适应症', '不良反应', 'url']
    drug_adverse.to_csv('data/first50_drug_dict.csv', index=False, columns=columns, encoding='utf_8_sig')
    first50_data = pd.read_csv('data/first50_drug_dict.csv', encoding='utf-8')

    # Remove the leading '[' and trailing ']' left by stringified lists.
    for col in ('symptom', 'postion'):
        comment_data.loc[:, col] = comment_data[col].str.strip('[')
        comment_data.loc[:, col] = comment_data[col].str.strip(']')

    first50_data['symptom'] = None
    first50_data['position'] = None
    # Aggregate the distinct non-empty predictions per drug.
    for drug_id in first50_data.药品ID.unique():
        matched = comment_data[comment_data.药品ID == drug_id]
        symptom = ','.join(set(filter(None, matched.symptom)))
        position = ','.join(set(filter(None, matched.postion)))
        first50_data.loc[first50_data.药品ID == drug_id, 'symptom'] = symptom
        first50_data.loc[first50_data.药品ID == drug_id, 'position'] = position

Опубликовать ( 0 )

Вы можете оставить комментарий после Вход в систему

1
https://api.gitlife.ru/oschina-mirror/zhangyafeii-lstm-crf.git
git@api.gitlife.ru:oschina-mirror/zhangyafeii-lstm-crf.git
oschina-mirror
zhangyafeii-lstm-crf
zhangyafeii-lstm-crf
master