[Python, Data Mining, Natural Language Processing] An Overview of Methods for Creating Sentence Embeddings, Part 2
Author
Message
news_bot ®
Member for: 6 years 9 months
Posts: 27286
Hello, this is the continuation of the article on methods for creating sentence embeddings. This guide has few words and a lot of code, ready for Ctrl+C, Ctrl+V, improvements, and further experiments.
Part 1 is required reading.
4. BERT
import os
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder
# link for downloading the models: http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html
# helper functions (similarity_values_wrapper, evaluate, parse_result, plot_results, vectorize) and the data generators are defined in Part 1
4.1 rubert_cased_L-12_H-768_A-12_pt
class RU_BERT_CLASS:
    def __init__(self, name):
        bert_config = read_json(configs.embedder.bert_embedder)
        bert_config['metadata']['variables']['BERT_PATH'] = os.path.join('./.', name)
        self.m = build_model(bert_config)
    def vectorizer(self, sentences):
        return [sentence.split() for sentence in sentences]
    def predict(self, tokens):
        _, _, _, _, sent_max_embs, sent_mean_embs, _ = self.m(tokens)
        return sent_mean_embs
bert = RU_BERT_CLASS('rubert_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'rubert')
'rubert: 2895.7'
4.2 ru_conversational_cased_L-12_H-768_A-12_pt
bert = RU_BERT_CLASS('ru_conversational_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'ru_conversational')
'ru_conversational: 3559.1'
4.3 sentence_ru_cased_L-12_H-768_A-12_pt
bert = RU_BERT_CLASS('sentence_ru_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'sentence_ru')
'sentence_ru: 2660.2'
4.4 elmo_ru-news_wmt11-16_1.5M_steps
class ELMO_CLASS(RU_BERT_CLASS):
    def __init__(self, name):
        self.m = ELMoEmbedder(f"http://files.deeppavlov.ai/deeppavlov_data/{name}")
    def predict(self, tokens):
        return self.m(tokens)
elmo = ELMO_CLASS('elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-news')
'elmo_ru-news: 4631.3'
4.5 elmo_ru-wiki_600k_steps
elmo = ELMO_CLASS('elmo_ru-wiki_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-wiki')
'elmo_ru-wiki: 4507.6'
4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps
elmo = ELMO_CLASS('elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances)
evaluate(get_similarity_values, 'elmo_ru-twitter')
'elmo_ru-twitter: 2962.2'
plot_results()
5. Autoencoders
Autoencoders are built to compress a high-dimensional input down to a single compact vector, so in theory they should be a perfect fit for producing sentence embeddings; the idea is sketched below, and the full LSTM-based variants follow in 5.1-5.4.
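For intuition, here is a minimal, self-contained sketch of the idea: a network is trained to reconstruct its own input, and the bottleneck layer is reused as the embedder, just like the 'embeding_output' layer in the sections below. The dense architecture and the toy random data are illustrative assumptions, not the article's setup.
# Minimal autoencoder sketch: compress, reconstruct, reuse the bottleneck as the embedding.
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

input_dim, bottleneck_dim = 1000, 64
inp = Input(shape=(input_dim,))
encoded = Dense(bottleneck_dim, activation='elu', name='embeding_output')(inp)
decoded = Dense(input_dim, activation='linear')(encoded)

autoencoder = Model(inp, decoded)
autoencoder.compile(loss='mse', optimizer='adam')
embedder = Model(inp, autoencoder.get_layer('embeding_output').output)

X = np.random.rand(256, input_dim)          # stand-in for vectorized sentences
autoencoder.fit(X, X, epochs=1, verbose=0)  # target equals input: pure reconstruction
sentence_embeddings = embedder.predict(X)   # shape (256, 64)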
5.1 Autoencoder: embeddings -> embeddings
def models_builder(data_generator):
def cosine_loss(y_true, y_pred):
return K.mean(cosine_similarity(y_true, y_pred, axis=-1))
complexity = 300
inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = inp
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X)
X = Flatten()(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(complexity, activation='linear', name='embeding_output')(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(data_generator.max_len*complexity, activation='elu')(X)
X = Reshape((data_generator.max_len, complexity))(X)
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Dense(data_generator.embedding_size, activation='elu')(X)
autoencoder = Model(inputs=inp, outputs=X)
autoencoder.compile(loss=cosine_loss, optimizer='adam')
autoencoder.summary()
embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
return autoencoder, embedder
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize, distance_function=cosine_distances)
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'автоэнкодер embedings -> embedings')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x, y in data_generator:
autoencoder.train_on_batch(x, x)
0 1770.2
3 212.6
6 138.8
9 84.8
12 78.1
15 106.4
18 112.7
21 79.7
5.2 Autoencoder: embeddings -> indexes
def models_builder(data_generator):
complexity = 300
inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = inp
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X)
X = Flatten()(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(complexity, activation='linear', name='embeding_output')(X)
X = Dense(complexity, activation='elu')(X)
X = Dense(data_generator.max_len*complexity, activation='elu')(X)
X = Reshape((data_generator.max_len, complexity))(X)
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Dense(len(data_generator.token2index), activation='softmax')(X)
autoencoder = Model(inputs=inp, outputs=X)
autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
autoencoder.summary()
embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
return autoencoder, embedder
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'автоэнкодер embedings -> indexes')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x_e, x_i, y_i in data_generator:
autoencoder.train_on_batch(x_e, x_i)
0 1352.9
3 43.6
6 41.7
9 8.1
12 -5.6
15 43.1
18 36.1
21 -3.7
5.3 Autoencoder: LSTM -> LSTM architecture
def models_builder(data_generator):
def cosine_loss(y_true, y_pred):
return K.mean(cosine_similarity(y_true, y_pred, axis=-1))
complexity = 300
inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = inp
X, state_h, state_c = LSTM(complexity, return_state=True)(X)
X = Concatenate()([state_h, state_c])
X = Dense(complexity, activation='linear', name='embeding_output')(X)
state_c = Dense(complexity, activation='linear')(X)
state_h = Dense(complexity, activation='linear')(X)
inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h])
X = Dense(data_generator.embedding_size, activation='linear')(X)
autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
autoencoder.compile(loss=cosine_loss, optimizer='adam')
autoencoder.summary()
embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
return autoencoder, embedder
data_generator = EmbedingsDataGenerator(use_fasttext=False)
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'автоэнкодер LSTM -> LSTM')  # evaluation label for section 5.3
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x, y in data_generator:
autoencoder.train_on_batch([x, zeros], x)
0 1903.6
3 1299.3
6 313.5
9 445.3
12 454.9
15 447.7
18 454.5
21 448.1
5.4 Autoencoder: LSTM -> LSTM -> indexes architecture
def models_builder(data_generator):
complexity = 300
inp = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = inp
X, state_h, state_c = LSTM(complexity, return_state=True)(X)
X = Concatenate()([state_h, state_c])
X = Dense(complexity, activation='linear', name='embeding_output')(X)
state_c = Dense(complexity, activation='linear')(X)
state_h = Dense(complexity, activation='linear')(X)
inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size))
X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h])
X = Dense(len(data_generator.token2index), activation='softmax')(X)
autoencoder = Model(inputs=[inp, inp_zeros], outputs=X)
autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
autoencoder.summary()
embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output)
return autoencoder, embedder
data_generator = IndexesDataGenerator()
autoencoder, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'автоэнкодер архитектура LSTM -> LSTM -> indexes')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x_e, x_i, y_i in data_generator:
autoencoder.train_on_batch([x_e, zeros], x_i)
0 1903.6
3 1483.3
6 1249.3
9 566.3
12 789.2
15 702.3
18 480.5
21 552.3
24 533.0
Supervised methods
6. Embeddings via Transfer Learning
The idea: train a network to predict a sentence's topic, then reuse the activations of an intermediate layer (named 'embeding_output' in the models below) as the sentence embedding.
TEXTS_CORPUS_WITH_LABEL = [(sentence, topic) for topic in texts_for_training for sentence in texts_for_training[topic]]
class BowDataGenerator(EmbedingsDataGenerator):
def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, batch_size=128, batches_per_epoch=100):
self.texts_topics = texts_topics
self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
self.batch_size = batch_size
self.batches_per_epoch = batches_per_epoch
self.count_vectorizer = CountVectorizer().fit([text_topic[0] for text_topic in self.texts_topics])
counts = Counter([text_topic[1] for text_topic in self.texts_topics])
self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts}
def vectorize(self, sentences):
return self.count_vectorizer.transform(sentences).toarray()
def __iter__(self):
for _ in tqdm(range(self.batches_per_epoch), leave=False):
X_batch = []
y_batch = []
finished_batch = False
while not finished_batch:
text, topic = random.choice(self.texts_topics)
X_batch.append(text)
y_batch.append(self.topic2index[topic])
if len(X_batch) >= self.batch_size:
X_batch = self.count_vectorizer.transform(X_batch).toarray()
y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
yield np.array(X_batch), np.array(y_batch)
finished_batch = True
data_generator = BowDataGenerator()
6.1 BOW-based embeddings
def models_builder(data_generator):
complexity = 500
inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
X = inp
X = Dense(complexity)(X)
X = Activation('elu')(X)
X = Dense(complexity)(X)
X = Activation('elu')(X)
X = Dense(complexity, name='embeding_output')(X)
X = Activation('elu')(X)
X = Dense(len(data_generator.topic2index), activation='softmax')(X)
model = Model(inputs=inp, outputs=X)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
return model, embedder
data_generator = BowDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'ембединг на BOW')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x, y in data_generator:
model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 601.4
3 1175.4
6 1187.0
9 1175.9
12 1097.9
15 1083.4
18 1083.8
21 1060.5
6.2 Embeddings with LSTM + MaxPooling (InferSent)
Reference links:
The arXiv paper with the theory
A plain-language explanation
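For orientation: the canonical InferSent encoder runs a BiLSTM over the word vectors and then takes an element-wise max over the time axis to get a fixed-size sentence vector. Below is a minimal sketch of that idea with arbitrary sizes; note that the article's model further down implements the pooling a bit differently (it permutes the axes and uses MaxPooling1D).
# Minimal InferSent-style encoder sketch: BiLSTM + max pooling over time.
import numpy as np
from tensorflow.keras.layers import Input, LSTM, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.models import Model

seq_len, emb_size, units = 20, 300, 256      # arbitrary illustrative sizes
inp = Input(shape=(seq_len, emb_size))
h = Bidirectional(LSTM(units, return_sequences=True))(inp)  # (seq_len, 2*units)
sentence_vector = GlobalMaxPooling1D()(h)                   # max over the time axis
encoder = Model(inp, sentence_vector)

dummy = np.random.rand(2, seq_len, emb_size)
print(encoder.predict(dummy).shape)          # (2, 512)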
class LabelsDataGenerator(EmbedingsDataGenerator):
def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, target_len=20, batch_size=128, batches_per_epoch=100, use_word2vec=True, use_fasttext=True):
self.texts_topics = texts_topics
self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})}
self.target_len = target_len
self.batch_size = batch_size
self.batches_per_epoch = batches_per_epoch
self.use_word2vec = use_word2vec
self.use_fasttext = use_fasttext
self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext))
counts = Counter([text_topic[1] for text_topic in self.texts_topics])
self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts}
def vectorize(self, sentences):
vectorized = []
for text in sentences:
tokens = str(text).split()
x_vec = []
for token in tokens:
token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
x_vec.append(token_vec)
vectorized.append(x_vec)
vectorized = pad_sequences(vectorized, maxlen=self.target_len)
return vectorized
def __iter__(self):
for _ in tqdm(range(self.batches_per_epoch), leave=False):
X_batch = []
y_batch = []
finished_batch = False
while not finished_batch:
text, topic = random.choice(self.texts_topics)
tokens = text.split()
x_vec = []
for token in tokens:
token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)
if len(x_vec) >= self.target_len:
X_batch.append(x_vec)
y_batch.append(self.topic2index[topic])
if len(X_batch) >= self.batch_size:
break
x_vec.append(token_vec)
else:
X_batch.append(x_vec)
y_batch.append(self.topic2index[topic])
if len(X_batch) >= self.batch_size:
X_batch = pad_sequences(X_batch, maxlen=self.target_len)
y_batch = to_categorical(y_batch, num_classes=len(self.topic2index))
yield np.array(X_batch), np.array(y_batch)
finished_batch = True
def models_builder(data_generator):
complexity = 768
inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
X = inp
X = Bidirectional(LSTM(complexity, return_sequences=True))(X)
X = Permute((2,1))(X)
X = MaxPooling1D(pool_size=600)(X)
X = Flatten()(X)
X = Dense(complexity)(X)
X = Activation('elu')(X)
X = Dense(complexity, name='embeding_output')(X)
X = Activation('sigmoid')(X)
X = Dense(len(data_generator.topic2index), activation='softmax')(X)
model = Model(inputs=inp, outputs=X)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'эмбединг на LSTM + MaxPooling')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x, y in data_generator:
model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 87.0
3 152.1
6 110.5
9 146.7
12 166.2
15 79.8
18 47.2
21 84.0
24 144.8
27 83.8
6.3 Embeddings with LSTM + Conv1D + AveragePooling
def models_builder(data_generator):
complexity = 600
inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
X_R = inp
X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R)
X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R)
X_C = inp
X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C)
X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C)
X = Concatenate()([X_R, X_C])
X = AveragePooling1D(pool_size=2)(X)
X = Conv1D(complexity, 3, strides=1, padding='same')(X)
X = AveragePooling1D(pool_size=2)(X)
X = Conv1D(complexity, 3, strides=1, padding='same')(X)
X = AveragePooling1D(pool_size=2)(X)
X = Flatten()(X)
X = Dense(complexity)(X)
X = Activation('sigmoid')(X)
X = Dense(complexity, name = 'embeding_output')(X)
X = Activation('elu')(X)
X = Dense(len(data_generator.topic2index), activation='softmax')(X)
model = Model(inputs=inp, outputs=X)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
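A training loop analogous to the other supervised sections produces the results below; a sketch (the evaluation label string is an assumption):
new_result = -10e5
for i in tqdm(range(1000)):
    if i%3==0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'эмбединг на LSTM + Conv1D + AveragePooling')  # assumed label
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for x, y in data_generator:
        model.train_on_batch(x, y, class_weight=data_generator.class_weight)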
0 353.8
3 -147.8
6 7.6
9 5.5
12 -133.6
15 -133.6
18 9.0
21 9.0
24 -133.6
6.4 Embeddings with LSTM + Inception + Attention
def models_builder(data_generator):
rate = 0.20
complexity = 500
def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
X = Activation('elu')(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
return X
def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
X = Bidirectional(LSTM(int(complexity/2), return_sequences=True))(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
return X
def dense_layer(X, complexity, activation='elu', rate=0.2, regularizer=0, name=None):
X = Dense(int(complexity), name=name)(X)
X = Activation(activation)(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
return X
inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
X = inp
X = inception_convolutional_layer(X, complexity)
X = inception_convolutional_layer(X, complexity)
X = inception_convolutional_layer(X, complexity)
X = MaxPooling1D(pool_size=2)(X)
X = inception_convolutional_layer(X, complexity)
X = MaxPooling1D(pool_size=2)(X)
X = inception_convolutional_layer(X, complexity)
X = MaxPooling1D(pool_size=2)(X)
R = inp
R = bi_LSTM(R, complexity)
R = bi_LSTM(R, complexity/2)
attention_probs = Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
R = multiply([R, attention_probs], name='attention_mul')
R = Dropout(rate)(R)
R = MaxPooling1D(pool_size=2)(R)
R = inception_convolutional_layer(R, complexity)
R = MaxPooling1D(pool_size=2)(R)
R = inception_convolutional_layer(R, complexity)
R = MaxPooling1D(pool_size=2)(R)
X = Concatenate(axis=-1)([X, R])
X = Flatten()(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
X = dense_layer(X, complexity)
X = dense_layer(X, complexity, activation='sigmoid')
X = dense_layer(X, complexity, name='embeding_output')
X = Dense(len(data_generator.topic2index), activation='softmax')(X)
model = Model(inputs=inp, outputs=X)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output)
return model, embedder
data_generator = LabelsDataGenerator()
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'эмбединг на LSTM + Inception + Attention')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for x, y in data_generator:
model.train_on_batch(x, y, class_weight=data_generator.class_weight)
0 275.0
3 126.8
6 173.9
9 155.5
12 168.4
15 287.2
18 382.8
21 303.4
plot_results()
7. Triplet loss
Training is built on the idea that vectors from the same intent should be pulled closer together while vectors from different intents are pushed apart. As a result, sentences with similar meaning end up near each other and sentences with different meanings end up far apart; a minimal illustration of the loss follows below.
More details about triplet loss can be found here.
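For a concrete reference point, the standard margin-based triplet loss is max(d(anchor, positive) - d(anchor, negative) + margin, 0); the article itself uses the "lossless" logarithmic variant shown in 7.1. A tiny NumPy sketch with made-up vectors:
# Standard triplet loss on made-up 3-dimensional embeddings, purely for illustration.
import numpy as np

def triplet_loss(anchor, positive, negative, margin=0.2):
    pos_dist = np.sum((anchor - positive) ** 2)   # squared distance to the positive
    neg_dist = np.sum((anchor - negative) ** 2)   # squared distance to the negative
    return max(pos_dist - neg_dist + margin, 0.0)

a = np.array([0.1, 0.9, 0.0])
p = np.array([0.2, 0.8, 0.1])   # same intent, close to the anchor
n = np.array([0.9, 0.1, 0.5])   # different intent, far from the anchor
print(triplet_loss(a, p, n))    # 0.0: the negative is already far enough away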
7.1 Triplet loss on BOW
class TripletDataGeneratorIndexes(BowDataGenerator):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.database = {}
for text, topic in self.texts_topics:
if topic not in self.database:
self.database[topic] = []
self.database[topic].append(text)
# keep only intents that have more than 5 messages
sh_database = {}
for topic in self.database:
if len(self.database[topic]) > 5:
sh_database[topic] = self.database[topic]
self.database = sh_database
self.all_topics = [topic for topic in self.database]
def __iter__(self):
for _ in tqdm(range(self.batches_per_epoch), leave=False):
anchor = []
positive = []
negative = []
for _ in range(self.batch_size):
anchor_topic = random.choice(self.all_topics)
anchor_index = np.random.randint(len(self.database[anchor_topic]))
positive_index = np.random.randint(len(self.database[anchor_topic]))
while positive_index == anchor_index:
positive_index = np.random.randint(len(self.database[anchor_topic]))
negative_topic = random.choice(self.all_topics)
while negative_topic == anchor_topic:
negative_topic = random.choice(self.all_topics)
negative_index = np.random.randint(len(self.database[negative_topic]))
anchor.append(self.database[anchor_topic][anchor_index])
positive.append(self.database[anchor_topic][positive_index])
negative.append(self.database[negative_topic][negative_index])
yield self.vectorize(anchor), self.vectorize(positive), self.vectorize(negative)
def models_builder(data_generator):
sentence_embeding_size = 100
def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
"""
Implementation of the triplet loss function
Arguments:
y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
y_pred -- python list containing three objects:
anchor -- the encodings for the anchor data
positive -- the encodings for the positive data (similar to anchor)
negative -- the encodings for the negative data (different from anchor)
N -- The number of dimension
beta -- The scaling factor, N is recommended
epsilon -- The Epsilon value to prevent ln(0)
Returns:
loss -- real number, value of the loss
"""
anchor = tf.convert_to_tensor(y_pred[:,0:N])
positive = tf.convert_to_tensor(y_pred[:,N:N*2])
negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])
# distance between the anchor and the positive
pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,positive)),1)
# distance between the anchor and the negative
neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,negative)),1)
#Non Linear Values
pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)
# compute loss
loss = neg_dist + pos_dist
return loss
def basic_sentence_vectorizer():
inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
X = inp
X = Dense(complexity)(X)
X = Activation('elu')(X)
X = Dense(complexity)(X)
X = Activation('elu')(X)
X = Dense(complexity, name='embeding_output')(X)
X = Activation('elu')(X)
X = Dense(complexity)(X)
vectorizer = Model(inputs=inp, outputs=X)
return vectorizer
complexity = 300
inp_anchor = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
inp_positive = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
inp_negative = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),))
embedder = basic_sentence_vectorizer()
anchor = embedder(inp_anchor)
positive = embedder(inp_positive)
negative = embedder(inp_negative)
output = Concatenate(axis=1)([anchor, positive, negative])
model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
model.summary()
return model, embedder
data_generator = TripletDataGeneratorIndexes(batch_size=128, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'triplet loss indexes')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i > 20:
break
for a, p, n in data_generator:
model.train_on_batch([a, p, n], zeros)
0 724.1
3 -143.5
6 11.7
9 36.2
12 -123.5
15 150.1
18 -51.9
21 5.0
24 -43.5
7.2 Triplet loss on embeddings
class TripletDataGeneratorEmbedings(TripletDataGeneratorIndexes):
def __init__(self, *args, **kwargs):
super().__init__()
self.target_len = kwargs['target_len']
self.embedding_size = len(vectorize('any_token'))
self.use_word2vec = True
self.use_fasttext = True
self.batches_per_epoch = kwargs['batches_per_epoch']
def vectorize(self, sentences):
return LabelsDataGenerator.vectorize(self, sentences)
def models_builder(data_generator):
sentence_embeding_size = 300
def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8):
"""
Implementation of the triplet loss function
Arguments:
y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
y_pred -- python list containing three objects:
anchor -- the encodings for the anchor data
positive -- the encodings for the positive data (similar to anchor)
negative -- the encodings for the negative data (different from anchor)
N -- The number of dimension
beta -- The scaling factor, N is recommended
epsilon -- The Epsilon value to prevent ln(0)
Returns:
loss -- real number, value of the loss
"""
anchor = tf.convert_to_tensor(y_pred[:,0:N])
positive = tf.convert_to_tensor(y_pred[:,N:N*2])
negative = tf.convert_to_tensor(y_pred[:,N*2:N*3])
# distance between the anchor and the positive
pos_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,positive)),1)
# distance between the anchor and the negative
neg_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor,negative)),1)
#Non Linear Values
pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon)
neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon)
# compute loss
loss = neg_dist + pos_dist
return loss
def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
X = Activation('elu')(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
return X
def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(complexity/2), return_sequences=True))(X)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dropout(rate)(X)
return X
def dense_layer(X, complexity, rate=0.2, regularizer=0):
X = tf.keras.layers.Dense(int(complexity))(X)
X = tf.keras.layers.Activation('elu')(X)
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dropout(rate)(X)
return X
def basic_sentence_vectorizer():
rate = 0.20
complexity = 300
inp = Input(shape = (data_generator.target_len, data_generator.embedding_size))
X = inp
X = inception_convolutional_layer(X, complexity)
X = inception_convolutional_layer(X, complexity)
X = inception_convolutional_layer(X, complexity)
X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
X = inception_convolutional_layer(X, complexity)
X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
X = inception_convolutional_layer(X, complexity)
X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
R = inp
R = bi_LSTM(R, complexity)
R = bi_LSTM(R, complexity/2)
attention_probs = tf.keras.layers.Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
R = multiply([R, attention_probs], name='attention_mul')
R = Dropout(rate)(R)
R = MaxPooling1D(pool_size=2)(R)
R = inception_convolutional_layer(R, complexity)
R = MaxPooling1D(pool_size=2)(R)
R = inception_convolutional_layer(R, complexity)
R = MaxPooling1D(pool_size=2)(R)
X = Concatenate(axis=-1)([X, R])
X = Flatten()(X)
X = BatchNormalization()(X)
X = Dropout(rate)(X)
X = dense_layer(X, complexity)
X = dense_layer(X, complexity)
X = dense_layer(X, complexity)
X = Dense(sentence_embeding_size, activation='sigmoid')(X)
vectorizer = Model(inputs=inp, outputs=X)
return vectorizer
inp_anchor = Input(shape = (data_generator.target_len, data_generator.embedding_size))
inp_positive = Input(shape = (data_generator.target_len, data_generator.embedding_size))
inp_negative = Input(shape = (data_generator.target_len, data_generator.embedding_size))
embedder = basic_sentence_vectorizer()
anchor = embedder(inp_anchor)
positive = embedder(inp_positive)
negative = embedder(inp_negative)
output = Concatenate(axis=1)([anchor, positive, negative])
model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
model.summary()
return model, embedder
data_generator = TripletDataGeneratorEmbedings(target_len=20, batch_size=32, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
if i%3==0:
previous_result = new_result
new_result = evaluate(get_similarity_values, 'triplet loss embeding')
new_result = parse_result(new_result)
print(i, new_result)
if new_result < previous_result and i>20:
break
for a, p, n in data_generator:
model.train_on_batch([a, p, n], zeros)
0 283.9
3 334.2
6 218.1
9 219.6
12 262.8
15 282.4
18 289.7
21 274.9
plot_results()
Conclusions
It was predictable that the ELMo models would be the winners, since they were built for vectorizing sentences. You can safely use them whenever you need to extract features from text quickly.
Personally, I was pleasantly surprised by BOW and the averaged embeddings. Even without taking word order into account, they managed to place sentences from the same topic next to each other.
The autoencoders were a disappointment. Right after initialization the score is better than after training. I can't say exactly what the problem is; most likely the autoencoder cannot compress the whole sentence properly and starts predicting zeros. If you have ideas for improving this, I'll be waiting in the comments.
My personal favorite, triplet loss on embeddings, didn't produce an outstanding result either. I think it would unlock its potential with models 100 times larger and several months of training.
Two methods, BOW over lemmas with stop words removed and the tf-idf-weighted average, don't give outstanding results on average, but for some sentences they work very well indeed. So for these methods everything should depend on the data (a sketch of the weighted-average idea follows below).
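As a reminder of what the tf-idf-weighted average looks like (the method itself is from Part 1), here is a small sketch; the toy sentences, the random stand-in word vectors, and the use of idf values as the weights are all illustrative assumptions.
# Sketch of an idf-weighted average of word vectors (toy data; word_vectors stands in for word2vec/fastText).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = ["кот сидит на окне", "собака лежит на полу"]
word_vectors = {w: np.random.rand(300) for s in sentences for w in s.split()}

tfidf = TfidfVectorizer().fit(sentences)
idf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))  # get_feature_names() on older scikit-learn

def sentence_embedding(sentence):
    tokens = sentence.split()
    weights = np.array([idf.get(t, 1.0) for t in tokens])
    vectors = np.array([word_vectors[t] for t in tokens])
    return (vectors * weights[:, None]).sum(axis=0) / weights.sum()

print(sentence_embedding(sentences[0]).shape)  # (300,)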
A Part 3 will probably follow in time, if I gather enough ideas.
===========
Source: habr.com
===========
Автор | Сообщение |
---|---|
news_bot ®
Стаж: 6 лет 9 месяцев |
|
Здравствуйте, продолжение статьи про методы создания эмбедингов предложений. В этом гайде мало слов и много кода, готово для Ctrl+с, Ctrl+v для удучшений и дальнейших тестов. Часть1 обязательна для ознакомления 4. BERT from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder # ссылка для скачивания моделей http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html 4.1 rubert_cased_L-12_H-768_A-12_pt class RU_BERT_CLASS:
def __init__(self, name): bert_config = read_json(configs.embedder.bert_embedder) bert_config['metadata']['variables']['BERT_PATH'] = os.path.join('./.', name) self.m = build_model(bert_config) def vectorizer(self, sentences): return [sentence.split() for sentence in sentences] def predict(self, tokens): _, _, _, _, sent_max_embs, sent_mean_embs, _ = self.m(tokens) return sent_mean_embs bert = RU_BERT_CLASS('rubert_cased_L-12_H-768_A-12_pt') get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'rubert') 'rubert: 2895.7' 4.2 ru_conversational_cased_L-12_H-768_A-12_pt bert = RU_BERT_CLASS('ru_conversational_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'ru_conversational') 'ru_conversational: 3559.1' 4.3 sentence_ru_cased_L-12_H-768_A-12_pt bert = RU_BERT_CLASS('sentence_ru_cased_L-12_H-768_A-12_pt')
get_similarity_values = similarity_values_wrapper(bert.predict, bert.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'sentence_ru') 'sentence_ru: 2660.2' 4.4 elmo_ru-news_wmt11-16_1.5M_steps class ELMO_CLASS(RU_BERT_CLASS):
def __init__(self, name): self.m = ELMoEmbedder(f"http://files.deeppavlov.ai/deeppavlov_data/{name}") def predict(self, tokens): return self.m(tokens) elmo = ELMO_CLASS('elmo_ru-news_wmt11-16_1.5M_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'elmo_ru-news') 'elmo_ru-news: 4631.3' 4.5 elmo_ru-wiki_600k_steps elmo = ELMO_CLASS('elmo_ru-wiki_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'elmo_ru-wiki') 'elmo_ru-wiki: 4507.6' 4.6 elmo_ru-twitter_2013-01_2018-04_600k_steps elmo = ELMO_CLASS('elmo_ru-twitter_2013-01_2018-04_600k_steps.tar.gz')
get_similarity_values = similarity_values_wrapper(elmo.predict, elmo.vectorizer, distance_function=cosine_distances) evaluate(get_similarity_values, 'elmo_ru-twitter') 'elmo_ru-twitter: 2962.2' plot_results()
5. Автоэнкодеры Автоэнкодеры созданы для сжатия многомерного ветора до одномерного и, теоретически, должны идеально подойти для создания эмбедингов предложения. 5.1 Автоэнкодер embedings -> embedings def models_builder(data_generator):
def cosine_loss(y_true, y_pred): return K.mean(cosine_similarity(y_true, y_pred, axis=-1)) complexity = 300 inp = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = inp X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X) X = Flatten()(X) X = Dense(complexity, activation='elu')(X) X = Dense(complexity, activation='elu')(X) X = Dense(complexity, activation='linear', name='embeding_output')(X) X = Dense(complexity, activation='elu')(X) X = Dense(data_generator.max_len*complexity, activation='elu')(X) X = Reshape((data_generator.max_len, complexity))(X) X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Dense(data_generator.embedding_size, activation='elu')(X) autoencoder = Model(inputs=inp, outputs=X) autoencoder.compile(loss=cosine_loss, optimizer='adam') autoencoder.summary() embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output) return autoencoder, embedder data_generator = EmbedingsDataGenerator(use_fasttext=False) autoencoder, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize, distance_function=cosine_distances) new_result = -10e5
for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'автоэнкодер embedings -> embedings') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x, y in data_generator: autoencoder.train_on_batch(x, x) 0 1770.2 3 212.6 6 138.8 9 84.8 12 78.1 15 106.4 18 112.7 21 79.7 5.2 Автоэнкодер embedings -> indexes def models_builder(data_generator):
complexity = 300 inp = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = inp X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Bidirectional(LSTM(int(complexity/10), return_sequences=True))(X) X = Flatten()(X) X = Dense(complexity, activation='elu')(X) X = Dense(complexity, activation='elu')(X) X = Dense(complexity, activation='linear', name='embeding_output')(X) X = Dense(complexity, activation='elu')(X) X = Dense(data_generator.max_len*complexity, activation='elu')(X) X = Reshape((data_generator.max_len, complexity))(X) X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Dense(len(data_generator.token2index), activation='softmax')(X) autoencoder = Model(inputs=inp, outputs=X) autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) autoencoder.summary() embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output) return autoencoder, embedder data_generator = IndexesDataGenerator() autoencoder, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) new_result = -10e5
for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'автоэнкодер embedings -> indexes') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x_e, x_i, y_i in data_generator: autoencoder.train_on_batch(x_e, x_i) 0 1352.9 3 43.6 6 41.7 9 8.1 12 -5.6 15 43.1 18 36.1 21 -3.7 5.3 Автоэнкодер архитектура LSTM -> LSTM def models_builder(data_generator):
def cosine_loss(y_true, y_pred): return K.mean(cosine_similarity(y_true, y_pred, axis=-1)) complexity = 300 inp = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = inp X, state_h, state_c = LSTM(complexity, return_state=True)(X) X = Concatenate()([state_h, state_c]) X = Dense(complexity, activation='linear', name='embeding_output')(X) state_c = Dense(complexity, activation='linear')(X) state_h = Dense(complexity, activation='linear')(X) inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h]) X = Dense(data_generator.embedding_size, activation='linear')(X) autoencoder = Model(inputs=[inp, inp_zeros], outputs=X) autoencoder.compile(loss=cosine_loss, optimizer='adam') autoencoder.summary() embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output) return autoencoder, embedder data_generator = EmbedingsDataGenerator(use_fasttext=False) autoencoder, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5 for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'автоэнкодер embedings -> indexes') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x, y in data_generator: autoencoder.train_on_batch([x, zeros], x) 0 1903.6 3 1299.3 6 313.5 9 445.3 12 454.9 15 447.7 18 454.5 21 448.1 5.4 Автоэнкодер архитектура LSTM -> LSTM -> indexes def models_builder(data_generator):
complexity = 300 inp = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = inp X, state_h, state_c = LSTM(complexity, return_state=True)(X) X = Concatenate()([state_h, state_c]) X = Dense(complexity, activation='linear', name='embeding_output')(X) state_c = Dense(complexity, activation='linear')(X) state_h = Dense(complexity, activation='linear')(X) inp_zeros = Input(shape=(data_generator.max_len, data_generator.embedding_size)) X = LSTM(complexity, return_sequences=True)(inp_zeros, [state_c, state_h]) X = Dense(len(data_generator.token2index), activation='softmax')(X) autoencoder = Model(inputs=[inp, inp_zeros], outputs=X) autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) autoencoder.summary() embedder = Model(inputs=inp, outputs=autoencoder.get_layer('embeding_output').output) return autoencoder, embedder data_generator = IndexesDataGenerator() autoencoder, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) zeros = np.zeros((data_generator.batch_size, data_generator.max_len, data_generator.embedding_size))
new_result = -10e5 for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'автоэнкодер архитектура LSTM -> LSTM -> indexes') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x_e, x_i, y_i in data_generator: autoencoder.train_on_batch([x_e, zeros], x_i) 0 1903.6 3 1483.3 6 1249.3 9 566.3 12 789.2 15 702.3 18 480.5 21 552.3 24 533.0 Методы с учителем 6. Эмбединги на Transfer Learning TEXTS_CORPUS_WITH_LABEL = [(sentence, topic) for topic in texts_for_training for sentence in texts_for_training[topic]]
class BowDataGenerator(EmbedingsDataGenerator): def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, batch_size=128, batches_per_epoch=100): self.texts_topics = texts_topics self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})} self.batch_size = batch_size self.batches_per_epoch = batches_per_epoch self.count_vectorizer = CountVectorizer().fit([text_topic[0] for text_topic in self.texts_topics]) counts = Counter([text_topic[1] for text_topic in self.texts_topics]) self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts} def vectorize(self, sentences): return self.count_vectorizer.transform(sentences).toarray() def __iter__(self): for _ in tqdm(range(self.batches_per_epoch), leave=False): X_batch = [] y_batch = [] finished_batch = False while not finished_batch: text, topic = random.choice(self.texts_topics) X_batch.append(text) y_batch.append(self.topic2index[topic]) if len(X_batch) >= self.batch_size: X_batch = self.count_vectorizer.transform(X_batch).toarray() y_batch = to_categorical(y_batch, num_classes=len(self.topic2index)) yield np.array(X_batch), np.array(y_batch) finished_batch = True data_generator = BowDataGenerator() 6.1 Эмбединги на основе BOW def models_builder(data_generator):
complexity = 500 inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),)) X = inp X = Dense(complexity)(X) X = Activation('elu')(X) X = Dense(complexity)(X) X = Activation('elu')(X) X = Dense(complexity, name='embeding_output')(X) X = Activation('elu')(X) X = Dense(len(data_generator.topic2index), activation='softmax')(X) model = Model(inputs=inp, outputs=X) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) model.summary() embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output) return model, embedder data_generator = BowDataGenerator() model, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) new_result = -10e5
for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'ембединг на BOW') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x, y in data_generator: model.train_on_batch(x, y, class_weight=data_generator.class_weight) 0 601.4 3 1175.4 6 1187.0 9 1175.9 12 1097.9 15 1083.4 18 1083.8 21 1060.5 6.2 Эмбединг на LSTM + MaxPooling (InferSent) Сыылки на стать: Arxiv с теорией Объяснено по-человечески class LabelsDataGenerator(EmbedingsDataGenerator):
def __init__(self, texts_topics=TEXTS_CORPUS_WITH_LABEL, target_len=20, batch_size=128, batches_per_epoch=100, use_word2vec=True, use_fasttext=True): self.texts_topics = texts_topics self.topic2index = {topic: index for index, topic in enumerate({topic for text, topic in self.texts_topics})} self.target_len = target_len self.batch_size = batch_size self.batches_per_epoch = batches_per_epoch self.use_word2vec = use_word2vec self.use_fasttext = use_fasttext self.embedding_size = len(vectorize('token', use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext)) counts = Counter([text_topic[1] for text_topic in self.texts_topics]) self.class_weight = {self.topic2index[intent_id]:1/counts[intent_id] for intent_id in counts} def vectorize(self, sentences): vectorized = [] for text in sentences: tokens = str(text).split() x_vec = [] for token in tokens: token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext) x_vec.append(token_vec) vectorized.append(x_vec) vectorized = pad_sequences(vectorized, maxlen=self.target_len) return vectorized def __iter__(self): for _ in tqdm(range(self.batches_per_epoch), leave=False): X_batch = [] y_batch = [] finished_batch = False while not finished_batch: text, topic = random.choice(self.texts_topics) tokens = text.split() x_vec = [] for token in tokens: token_vec = vectorize(token, use_word2vec=self.use_word2vec, use_fasttext=self.use_fasttext) if len(x_vec) >= self.target_len: X_batch.append(x_vec) y_batch.append(self.topic2index[topic]) if len(X_batch) >= self.batch_size: break x_vec.append(token_vec) else: X_batch.append(x_vec) y_batch.append(self.topic2index[topic]) if len(X_batch) >= self.batch_size: X_batch = pad_sequences(X_batch, maxlen=self.target_len) y_batch = to_categorical(y_batch, num_classes=len(self.topic2index)) yield np.array(X_batch), np.array(y_batch) finished_batch = True def models_builder(data_generator):
complexity = 768 inp = Input(shape=(data_generator.target_len, data_generator.embedding_size)) X = inp X = Bidirectional(LSTM(complexity, return_sequences=True))(X) X = Permute((2,1))(X) X = MaxPooling1D(pool_size=600)(X) X = Flatten()(X) X = Dense(complexity)(X) X = Activation('elu')(X) X = Dense(complexity, name='embeding_output')(X) X = Activation('sigmoid')(X) X = Dense(len(data_generator.topic2index), activation='softmax')(X) model = Model(inputs=inp, outputs=X) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) model.summary() embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output) return model, embedder data_generator = LabelsDataGenerator() model, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) new_result = -10e5
for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'эмбединг на LSTM + MaxPooling') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x, y in data_generator: model.train_on_batch(x, y, class_weight=data_generator.class_weight) 0 87.0 3 152.1 6 110.5 9 146.7 12 166.2 15 79.8 18 47.2 21 84.0 24 144.8 27 83.8 6.3 Эмбединг на LSTM + Conv1D + AveragePooling def models_builder(data_generator):
complexity = 600 inp = Input(shape=(data_generator.target_len, data_generator.embedding_size)) X_R = inp X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R) X_R = Bidirectional(LSTM(complexity, return_sequences=True))(X_R) X_C = inp X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C) X_C = Conv1D(complexity, 3, strides=1, padding='same')(X_C) X = Concatenate()([X_R, X_C]) X = AveragePooling1D(pool_size=2)(X) X = Conv1D(complexity, 3, strides=1, padding='same')(X) X = AveragePooling1D(pool_size=2)(X) X = Conv1D(complexity, 3, strides=1, padding='same')(X) X = AveragePooling1D(pool_size=2)(X) X = Flatten()(X) X = Dense(complexity)(X) X = Activation('sigmoid')(X) X = Dense(complexity, name = 'embeding_output')(X) X = Activation('elu')(X) X = Dense(len(data_generator.topic2index), activation='softmax')(X) model = Model(inputs=inp, outputs=X) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) model.summary() embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output) return model, embedder data_generator = LabelsDataGenerator() model, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) 0 353.8 3 -147.8 6 7.6 9 5.5 12 -133.6 15 -133.6 18 9.0 21 9.0 24 -133.6 6.4 Эмбединг на LSTM + Inception + Attention def models_builder(data_generator):
rate = 0.20 complexity = 500 def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0): X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X) X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X) X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X) X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X) X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X) X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X) X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X) X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1]) X = Activation('elu')(X) X = BatchNormalization()(X) X = Dropout(rate)(X) return X def bi_LSTM(X, complexity, rate=0.2, regularizer=0): X = Bidirectional(LSTM(int(complexity/2), return_sequences=True))(X) X = BatchNormalization()(X) X = Dropout(rate)(X) return X def dense_layer(X, complexity, activation='elu', rate=0.2, regularizer=0, name=None): X = Dense(int(complexity), name=name)(X) X = Activation(activation)(X) X = BatchNormalization()(X) X = Dropout(rate)(X) return X inp = Input(shape=(data_generator.target_len, data_generator.embedding_size)) X = inp X = inception_convolutional_layer(X, complexity) X = inception_convolutional_layer(X, complexity) X = inception_convolutional_layer(X, complexity) X = MaxPooling1D(pool_size=2)(X) X = inception_convolutional_layer(X, complexity) X = MaxPooling1D(pool_size=2)(X) X = inception_convolutional_layer(X, complexity) X = MaxPooling1D(pool_size=2)(X) R = inp R = bi_LSTM(R, complexity) R = bi_LSTM(R, complexity/2) attention_probs = Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R) R = multiply([R, attention_probs], name='attention_mul') R = Dropout(rate)(R) R = MaxPooling1D(pool_size=2)(R) R = inception_convolutional_layer(R, complexity) R = MaxPooling1D(pool_size=2)(R) R = inception_convolutional_layer(R, complexity) R = MaxPooling1D(pool_size=2)(R) X = Concatenate(axis=-1)([X, R]) X = Flatten()(X) X = BatchNormalization()(X) X = Dropout(rate)(X) X = dense_layer(X, complexity) X = dense_layer(X, complexity, activation='sigmoid') X = dense_layer(X, complexity, name='embeding_output') X = Dense(len(data_generator.topic2index), activation='softmax')(X) model = Model(inputs=inp, outputs=X) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc']) model.summary() embedder = Model(inputs=inp, outputs=model.get_layer('embeding_output').output) return model, embedder data_generator = LabelsDataGenerator() model, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) new_result = -10e5
for i in tqdm(range(1000)): if i%3==0: previous_result = new_result new_result = evaluate(get_similarity_values, 'эмбединг на LSTM + Inception + Attention') new_result = parse_result(new_result) print(i, new_result) if new_result < previous_result and i > 20: break for x, y in data_generator: model.train_on_batch(x, y, class_weight=data_generator.class_weight) 0 275.0 3 126.8 6 173.9 9 155.5 12 168.4 15 287.2 18 382.8 21 303.4 plot_results()
7 Triplet loss Обучение будет происходит на том, что мы векторы из одного интента должны распологаться ближе друг к другу, а из разных интентов, дальше. Тем самым предложения, иемющие похожий смысл будут стоять ближ друг к другу, а разный, будут отстоять друг от друга. Подробнее про Triplet loss вот тут 7.1 Triplet loss на BOW class TripletDataGeneratorIndexes(BowDataGenerator):
def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.database = {} for text, topic in self.texts_topics: if topic not in self.database: self.database[topic] = [] self.database[topic].append(text) # почистим все интенты с <5 сообщениями sh_database = {} for topic in self.database: if len(self.database[topic]) > 5: sh_database[topic] = self.database[topic] self.database = sh_database self.all_topics = [topic for topic in self.database] def __iter__(self): for _ in tqdm(range(self.batches_per_epoch), leave=False): anchor = [] positive = [] negative = [] for _ in range(self.batch_size): anchor_topic = random.choice(self.all_topics) anchor_index = np.random.randint(len(self.database[anchor_topic])) positive_index = np.random.randint(len(self.database[anchor_topic])) while positive_index == anchor_index: positive_index = np.random.randint(len(self.database[anchor_topic])) negative_topic = random.choice(self.all_topics) while negative_topic == anchor_topic: negative_topic = random.choice(self.all_topics) negative_index = np.random.randint(len(self.database[negative_topic])) anchor.append(self.database[anchor_topic][anchor_index]) positive.append(self.database[anchor_topic][positive_index]) negative.append(self.database[negative_topic][negative_index]) yield self.vectorize(anchor), self.vectorize(positive), self.vectorize(negative) def models_builder(data_generator):
sentence_embeding_size = 100 def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=100, epsilon=1e-8): """ Implementation of the triplet loss function Arguments: y_true -- true labels, required when you define a loss in Keras, you don't need it in this function. y_pred -- python list containing three objects: anchor -- the encodings for the anchor data positive -- the encodings for the positive data (similar to anchor) negative -- the encodings for the negative data (different from anchor) N -- The number of dimension beta -- The scaling factor, N is recommended epsilon -- The Epsilon value to prevent ln(0) Returns: loss -- real number, value of the loss """ anchor = tf.convert_to_tensor(y_pred[:,0:N]) positive = tf.convert_to_tensor(y_pred[:,N:N*2]) negative = tf.convert_to_tensor(y_pred[:,N*2:N*3]) # distance between the anchor and the positive pos_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,positive)),1) # distance between the anchor and the negative neg_dist = tf.reduce_sum(tf.square(tf.subtract(anchor,negative)),1) #Non Linear Values pos_dist = -tf.math.log(-tf.math.divide((pos_dist),beta)+1+epsilon) neg_dist = -tf.math.log(-tf.math.divide((N-neg_dist),beta)+1+epsilon) # compute loss loss = neg_dist + pos_dist return loss def basic_sentence_vectorizer(): inp = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),)) X = inp X = Dense(complexity)(X) X = Activation('elu')(X) X = Dense(complexity)(X) X = Activation('elu')(X) X = Dense(complexity, name='embeding_output')(X) X = Activation('elu')(X) X = Dense(complexity)(X) vectorizer = Model(inputs=inp, outputs=X) return vectorizer complexity = 300 inp_anchor = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),)) inp_positive = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),)) inp_negative = Input(shape=(len(data_generator.count_vectorizer.get_feature_names()),)) embedder = basic_sentence_vectorizer() anchor = embedder(inp_anchor) positive = embedder(inp_positive) negative = embedder(inp_negative) output = Concatenate(axis=1)([anchor, positive, negative]) model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output) model.compile(optimizer='adagrad', loss=lossless_triplet_loss) model.summary() return model, embedder data_generator = TripletDataGeneratorIndexes(batch_size=128, batches_per_epoch=10000) model, embedder = models_builder(data_generator) get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize) zeros = np.zeros((data_generator.batch_size, 1, 1))
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        # evaluate every third epoch and stop once the score stops improving
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss indexes')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)

0 724.1
3 -143.5
6 11.7
9 36.2
12 -123.5
15 150.1
18 -51.9
21 5.0
24 -43.5

7.2 Triplet loss on embeddings
class TripletDataGeneratorEmbedings(TripletDataGeneratorIndexes):
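    # Same triplet sampling as TripletDataGeneratorIndexes, but vectorize() below returns
    # fixed-length (target_len) sequences of word embeddings instead of count-vectorizer rows.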
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.target_len = kwargs['target_len']
        self.embedding_size = len(vectorize('any_token'))
        self.use_word2vec = True
        self.use_fasttext = True
        self.batches_per_epoch = kwargs['batches_per_epoch']

    def vectorize(self, sentences):
        return LabelsDataGenerator.vectorize(self, sentences)

def models_builder(data_generator):
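    # Same triplet setup as before, but the shared embedder now works on sequences of word
    # embeddings and combines an inception-style Conv1D stack with a BiLSTM + attention branch.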
    sentence_embeding_size = 300

    # beta = N, as the docstring recommends: with sigmoid embeddings the distances stay below N,
    # so the arguments of the logarithms stay positive
    def lossless_triplet_loss(y_true, y_pred, N=sentence_embeding_size, beta=sentence_embeding_size, epsilon=1e-8):
        """
        Implementation of the triplet loss function
        Arguments:
        y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
        y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
        N -- The number of dimensions
        beta -- The scaling factor, N is recommended
        epsilon -- The Epsilon value to prevent ln(0)
        Returns:
        loss -- real number, value of the loss
        """
        anchor = tf.convert_to_tensor(y_pred[:, 0:N])
        positive = tf.convert_to_tensor(y_pred[:, N:N*2])
        negative = tf.convert_to_tensor(y_pred[:, N*2:N*3])
        # distance between the anchor and the positive
        pos_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor, positive)), 1)
        # distance between the anchor and the negative
        neg_dist = tf.math.reduce_sum(tf.math.square(tf.math.subtract(anchor, negative)), 1)
        # non-linear values
        pos_dist = -tf.math.log(-tf.math.divide(pos_dist, beta) + 1 + epsilon)
        neg_dist = -tf.math.log(-tf.math.divide(N - neg_dist, beta) + 1 + epsilon)
        # compute loss
        loss = neg_dist + pos_dist
        return loss

    def inception_convolutional_layer(X, complexity, rate=0.2, regularizer=0):
        # parallel Conv1D branches with kernel sizes 1..7, concatenated inception-style
        X_7 = Conv1D(int(complexity/7), kernel_size=7, strides=1, padding='same')(X)
        X_6 = Conv1D(int(complexity/6), kernel_size=6, strides=1, padding='same')(X)
        X_5 = Conv1D(int(complexity/5), kernel_size=5, strides=1, padding='same')(X)
        X_4 = Conv1D(int(complexity/4), kernel_size=4, strides=1, padding='same')(X)
        X_3 = Conv1D(int(complexity/3), kernel_size=3, strides=1, padding='same')(X)
        X_2 = Conv1D(int(complexity/2), kernel_size=2, strides=1, padding='same')(X)
        X_1 = Conv1D(int(complexity/1), kernel_size=1, strides=1, padding='same')(X)
        X = Concatenate()([X_7, X_6, X_5, X_4, X_3, X_2, X_1])
        X = Activation('elu')(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        return X

    def bi_LSTM(X, complexity, rate=0.2, regularizer=0):
        X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(complexity/2), return_sequences=True))(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def dense_layer(X, complexity, rate=0.2, regularizer=0):
        X = tf.keras.layers.Dense(int(complexity))(X)
        X = tf.keras.layers.Activation('elu')(X)
        X = tf.keras.layers.BatchNormalization()(X)
        X = tf.keras.layers.Dropout(rate)(X)
        return X

    def basic_sentence_vectorizer():
        rate = 0.20
        complexity = 300
        inp = Input(shape=(data_generator.target_len, data_generator.embedding_size))
        # convolutional branch
        X = inp
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        X = inception_convolutional_layer(X, complexity)
        X = tf.keras.layers.MaxPooling1D(pool_size=2)(X)
        # recurrent branch with attention
        R = inp
        R = bi_LSTM(R, complexity)
        R = bi_LSTM(R, complexity/2)
        attention_probs = tf.keras.layers.Dense(int(complexity/2), activation='sigmoid', name='attention_probs')(R)
        R = multiply([R, attention_probs], name='attention_mul')
        R = Dropout(rate)(R)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        R = inception_convolutional_layer(R, complexity)
        R = MaxPooling1D(pool_size=2)(R)
        # merge the two branches before the dense head
        X = Concatenate(axis=-1)([X, R])
        X = Flatten()(X)
        X = BatchNormalization()(X)
        X = Dropout(rate)(X)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = dense_layer(X, complexity)
        X = Dense(sentence_embeding_size, activation='sigmoid')(X)
        vectorizer = Model(inputs=inp, outputs=X)
        return vectorizer

    inp_anchor = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    inp_positive = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    inp_negative = Input(shape=(data_generator.target_len, data_generator.embedding_size))
    embedder = basic_sentence_vectorizer()
    anchor = embedder(inp_anchor)
    positive = embedder(inp_positive)
    negative = embedder(inp_negative)
    output = Concatenate(axis=1)([anchor, positive, negative])
    model = Model(inputs=[inp_anchor, inp_positive, inp_negative], outputs=output)
    model.compile(optimizer='adagrad', loss=lossless_triplet_loss)
    model.summary()
    return model, embedder

data_generator = TripletDataGeneratorEmbedings(target_len=20, batch_size=32, batches_per_epoch=10000)
model, embedder = models_builder(data_generator)
get_similarity_values = similarity_values_wrapper(embedder.predict, data_generator.vectorize)
zeros = np.zeros((data_generator.batch_size, 1, 1))
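One detail worth spelling out: the final sigmoid is not cosmetic. It keeps every coordinate of the embeddings in $(0, 1)$, so the squared distances in the loss satisfy $0 \le d_{+}, d_{-} < N$, and therefore
$1 - \frac{d_{+}}{\beta} + \epsilon > 0 \quad\text{and}\quad 1 - \frac{N - d_{-}}{\beta} + \epsilon > 0 \qquad\text{whenever } \beta \ge N,$
which is exactly why the docstring recommends $\beta = N$: with a smaller $\beta$ a logarithm can receive a negative argument and the loss becomes NaN.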
new_result = -10e5
for i in tqdm(range(1000)):
    if i % 3 == 0:
        previous_result = new_result
        new_result = evaluate(get_similarity_values, 'triplet loss embeding')
        new_result = parse_result(new_result)
        print(i, new_result)
        if new_result < previous_result and i > 20:
            break
    for a, p, n in data_generator:
        model.train_on_batch([a, p, n], zeros)

0 283.9
3 334.2
6 218.1
9 219.6
12 262.8
15 282.4
18 289.7
21 274.9

plot_results()
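Whichever variant is trained, the embedder returned by models_builder can be used on its own to vectorize new sentences. A minimal sketch, assuming data_generator.vectorize accepts a plain list of raw sentences (as it does inside the generators above); the sentences themselves are made-up examples:
# made-up example sentences, just to show the call sequence
sentences = ['как оформить возврат товара', 'какая завтра погода']
sentence_vectors = embedder.predict(data_generator.vectorize(sentences))
print(sentence_vectors.shape)  # (2, 300) -- 300 is the sentence_embeding_size set inside models_builder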
Conclusions
It was easy to predict that the ELMo models would come out on top: they were built specifically to vectorize sentences. You can safely reach for them whenever you need to extract features from text quickly.
Personally, I was pleasantly surprised by BOW and the plain average of word embeddings. Even without any notion of word order, they managed to place sentences from the same topic next to each other.
The autoencoders were a disappointment: right after initialization the score is better than after training. I cannot say exactly where the problem is; most likely the autoencoder cannot compress the whole sentence properly and starts predicting zeros. If you have ideas for improving this, I am waiting for them in the comments.
My personal favorite, triplet loss on embeddings, did not give an outstanding result either. I think it would show its potential on models a hundred times larger, trained over several months.
Two methods, BOW over lemmas without stop words and the tf-idf-weighted average, do not give outstanding average scores, but for some sentences they work very well indeed, so with these methods everything depends on the data.
Part 3 may well follow in time, if I collect enough ideas.
Source: habr.com