class JiebaTokenizer(Tokenizer, Component):
    """Tokenizer based on the jieba Chinese word-segmentation library.

    Splits the message attributes listed in ``MESSAGE_ATTRIBUTES`` (text,
    intent, response) into ``Token`` objects and stores them under the
    corresponding ``MESSAGE_TOKENS_NAMES`` key.
    """

    # One tokens attribute is provided per message attribute.
    provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]

    # jieba only segments Chinese text.
    language_list = ["zh"]

    defaults = {
        # Directory holding custom jieba dictionaries; None = don't load any.
        "dictionary_path": None,
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
    }

    def __init__(self, component_config: Dict[Text, Any] = None) -> None:
        """Construct a new tokenizer using the jieba framework."""
        super().__init__(component_config)

        # path to a directory of custom dictionary files, or None
        self.dictionary_path = self.component_config.get("dictionary_path")
        # flag to check whether to split intents
        self.intent_tokenization_flag = self.component_config.get(
            "intent_tokenization_flag"
        )
        # symbol to split intents on
        self.intent_split_symbol = self.component_config.get("intent_split_symbol")
        # load custom dictionaries, if configured
        if self.dictionary_path is not None:
            self.load_custom_dictionary(self.dictionary_path)

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the third-party packages this component needs at runtime."""
        return ["jieba"]

    @staticmethod
    def load_custom_dictionary(path: Text) -> None:
        """Load all the custom dictionaries stored in the path.

        More information about the dictionaries file format can
        be found in the documentation of jieba.
        https://github.com/fxsjy/jieba#load-dictionary
        """
        import jieba

        jieba_userdicts = glob.glob("{}/*".format(path))
        for jieba_userdict in jieba_userdicts:
            # Lazy %-style args: the string is only formatted if the
            # record is actually emitted.
            logger.info("Loading Jieba User Dictionary at %s", jieba_userdict)
            jieba.load_userdict(jieba_userdict)

    def train(
        self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any
    ) -> None:
        """Tokenize every present attribute of every training example."""
        for example in training_data.training_examples:
            for attribute in MESSAGE_ATTRIBUTES:
                if example.get(attribute) is not None:
                    example.set(
                        MESSAGE_TOKENS_NAMES[attribute],
                        self.tokenize(example.get(attribute), attribute),
                    )

    def process(self, message: Message, **kwargs: Any) -> None:
        """Tokenize the text of a single incoming message at inference time."""
        message.set(
            MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE],
            self.tokenize(message.text, MESSAGE_TEXT_ATTRIBUTE),
        )

    def preprocess_text(self, text: Text, attribute: Text) -> Text:
        """Replace intent split symbols with spaces when intent splitting is enabled.

        For any attribute other than the intent (or when splitting is
        disabled) the text is returned unchanged.
        """
        if attribute == MESSAGE_INTENT_ATTRIBUTE and self.intent_tokenization_flag:
            return " ".join(text.split(self.intent_split_symbol))
        else:
            return text

    def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
        """Segment `text` with jieba and return the resulting tokens."""
        import jieba

        text = self.preprocess_text(text, attribute)
        tokenized = jieba.tokenize(text)
        # jieba yields (word, start, end); Token stores the word and its
        # start offset in the original string.
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional["Metadata"] = None,
        cached_component: Optional[Component] = None,
        **kwargs: Any
    ) -> "JiebaTokenizer":
        """Load a persisted tokenizer, resolving the dictionary path if any."""
        relative_dictionary_path = meta.get("dictionary_path")

        # get real path of dictionary path, if any
        if relative_dictionary_path is not None:
            dictionary_path = os.path.join(model_dir, relative_dictionary_path)
            meta["dictionary_path"] = dictionary_path

        return cls(meta)

    @staticmethod
    def copy_files_dir_to_dir(input_dir: Text, output_dir: Text) -> None:
        """Copy every file in `input_dir` into `output_dir`, creating it if needed."""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        target_file_list = glob.glob("{}/*".format(input_dir))
        for target_file in target_file_list:
            shutil.copy2(target_file, output_dir)

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this model into the passed directory.

        Copies the custom dictionaries (if any) into the model directory and
        returns the metadata needed by :meth:`load` to restore them.
        """
        if self.dictionary_path is not None:
            target_dictionary_path = os.path.join(model_dir, file_name)
            self.copy_files_dir_to_dir(self.dictionary_path, target_dictionary_path)
            return {"dictionary_path": file_name}
        else:
            return {"dictionary_path": None}