#!/usr/bin/env python
# -*- coding:utf-8 -*-
“””
@Author :geekzw
@Contact :1223242863@qq.com
@File :AC_ner.py
@Time :2020/3/5 12:56 AM
@Software :Pycharm
@Copyright (c) 2020,All Rights Reserved.
“””
import time
class node(object):
def __init__(self):
self.next = {} # 相当于指针,指向树节点的下一层节点
self.fail = None # 失配指针,这个是AC自动机的关键
self.isWord = False # 标记,用来判断是否是一个标签的结尾
self.word = “” # 用来储存标签
class ac_automation(object):
def __init__(self, user_dict_path):
self.root = node()
self.user_dict_path = user_dict_path
def add(self, word):
temp_root = self.root
for char in word:
if char not in temp_root.next:
temp_root.next[char] = node()
temp_root = temp_root.next[char]
temp_root.isWord = True
temp_root.word = word
# 添加文件中的关键词
def add_keyword(self):
with open(self.user_dict_path, “r”, encoding=”utf-8″) as file:
for line in file:
self.add(line.strip())
def make_fail(self):
temp_que = []
temp_que.append(self.root)
while len(temp_que) != 0:
temp = temp_que.pop(0)
p = None
for key,value in temp.next.item():
if temp == self.root:
temp.next[key].fail = self.root
else:
p = temp.fail
while p is not None:
if key in p.next:
temp.next[key].fail = p.fail
break
p = p.fail
if p is None:
temp.next[key].fail = self.root
temp_que.append(temp.next[key])
def search(self, content):
p = self.root
result = set()
index = 0
while index < len(content) - 1:
currentposition = index
while currentposition < len(content):
word = content[currentposition]
while word in p.next == False and p != self.root:
p = p.fail
if word in p.next:
p = p.next[word]
else:
p = self.root
if p.isWord:
end_index = currentposition + 1
result.add((p.word, end_index - len(p.word), end_index))
break
currentposition += 1
p = self.root
index += 1
return result
if name == "__main__":
ac = ac_automation(user_dict_path="../../funNLP/organization_dict.txt")
ac.add_keyword() # 添加关键词到AC自动机
while True:
query = input("\nINPUT: ")
ss = time.time()
res = ac.search(query)
print("TIME: {0} ms!".format(round(1000 * (time.time() - ss), 3)))
print("OUTPUT:", res)
输出:
说明:对比发现,自己改写的纯Python版本,存在优化空间,降低耗时成本!