# rasa/nlu/featurizers/count_vectors_featurizer.py:485
def train(
    self, training_data: TrainingData, cfg: RasaNLUModelConfig = None, **kwargs: Any
) -> None:
    """Train the featurizer.

    Take parameters from config and
    construct a new count vectorizer using the sklearn framework.
    """
    spacy_nlp = kwargs.get("spacy_nlp")  # None in this run
    if spacy_nlp is not None:
        # create spacy lemma_ for OOV_words
        self.OOV_words = [t.lemma_ for w in self.OOV_words for t in spacy_nlp(w)]

    # process sentences and collect data for all attributes
    # Concrete content in this example:
    # {'text': ['hey', 'hello', 'hi', 'good morning', 'good evening', 'hey there', 'bye', 'goodbye', 'see you around', 'see you later', 'yes', 'indeed', 'of course', 'that sounds good', 'correct', 'no', 'never', 'i don t think so', 'don t like that', 'no way', 'not really', 'perfect', 'very good', 'great', 'amazing', 'wonderful', 'i am feeling very good', 'i am great', 'i m good', 'sad', 'very sad', 'unhappy', 'bad', 'very bad', 'awful', 'terrible', 'not very good', 'extremely sad', 'so sad', 'are you a bot', 'are you a human', 'am i talking to a bot', 'am i talking to a human'],
    #  'intent': ['greet', 'greet', 'greet', 'greet', 'greet', 'greet', 'goodbye', 'goodbye', 'goodbye', 'goodbye', 'affirm', 'affirm', 'affirm', 'affirm', 'affirm', 'deny', 'deny', 'deny', 'deny', 'deny', 'deny', 'mood_great', 'mood_great', 'mood_great', 'mood_great', 'mood_great', 'mood_great', 'mood_great', 'mood_great', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'mood_unhappy', 'bot_challenge', 'bot_challenge', 'bot_challenge', 'bot_challenge'],
    #  'response': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']}
    processed_attribute_texts = self._get_all_attributes_processed_texts(
        training_data
    )

    # Encode the content of processed_attribute_texts with
    # sklearn.feature_extraction.text.CountVectorizer.
    # train for all attributes
    if self.use_shared_vocab:  # False here
        self._train_with_shared_vocab(processed_attribute_texts)
    else:
        self._train_with_independent_vocab(processed_attribute_texts)

    # transform for all attributes
    for attribute in MESSAGE_ATTRIBUTES:
        attribute_features = self._get_featurized_attribute(
            attribute, processed_attribute_texts[attribute]
        )
        # set the encoded features on the training data
        if attribute_features is not None:
            self._set_attribute_features(
                attribute, attribute_features, training_data
            )
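
The OOV branch is skipped above because no spacy pipeline is attached, but when `SpacyNLP` is in the pipeline, the comprehension simply replaces each configured out-of-vocabulary word with its spacy lemma. A minimal standalone sketch of that step (assuming an English spacy model such as en_core_web_sm is installed; the word list is made up for illustration):

from spacy import load

spacy_nlp = load("en_core_web_sm")  # assumption: this model is installed locally

# hypothetical OOV words as they might come from the component config
OOV_words = ["cats", "running"]

# same comprehension as in train(): every token of every OOV word is lemmatized
OOV_words = [t.lemma_ for w in OOV_words for t in spacy_nlp(w)]
print(OOV_words)  # expected something like ['cat', 'run']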
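Both `_train_with_shared_vocab` and `_train_with_independent_vocab` boil down to fitting `sklearn.feature_extraction.text.CountVectorizer` objects on these per-attribute texts, and `_get_featurized_attribute` then transforms each attribute's texts into sparse bag-of-words count matrices. A standalone sketch of that sklearn step for the independent-vocab case (not Rasa's exact helper code; the texts are a small subset of the example above):

from sklearn.feature_extraction.text import CountVectorizer

# a small subset of the processed texts shown above
processed_attribute_texts = {
    "text": ["hey", "hello", "good morning", "see you later", "i am great"],
    "intent": ["greet", "greet", "greet", "goodbye", "mood_great"],
}

vectorizers = {}
features = {}
for attribute, texts in processed_attribute_texts.items():
    # independent vocab: a separate CountVectorizer per attribute;
    # this token_pattern keeps single-character tokens such as 'i'
    vectorizers[attribute] = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
    features[attribute] = vectorizers[attribute].fit_transform(texts)

# vocabulary learned for the 'text' attribute
# (use get_feature_names() on sklearn versions older than 1.0)
print(vectorizers["text"].get_feature_names_out())
# ['am' 'good' 'great' 'hello' 'hey' 'i' 'later' 'morning' 'see' 'you']

print(features["text"].toarray()[2])
# count vector for 'good morning': a 1 in the 'good' and 'morning' columns

With `use_shared_vocab` enabled, a single vectorizer would instead be fitted on the texts of all attributes combined, so that for example 'text' and 'intent' share one vocabulary and produce vectors of the same width.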