import gensim
# Toy corpus: each sentence is lowercased, trimmed, and split on single
# spaces into a list of tokens.
sentences = [
    text.lower().strip().split(" ")
    for text in ("He is a boy", "He is a man", "She is a girl")
]
#----------------------------------------
# Tagging Sentences
# Each sentence is treated as its own document ("paragraph") and is tagged
# with its position in `sentences`, so that index serves as the document id.
tagged_documents = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(sentences)
]
# Doc2Vec model with 50-dimensional vectors; min_count=1 sets the word
# frequency cutoff to 1 for this tiny corpus.
Doc2Vec_model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=1,
)
#----------------------------------------
# Build the word2vec vocabulary
# Doc2vec is based on word2vec, so the vocabulary is built before training.
Doc2Vec_model.build_vocab(tagged_documents)
#----------------------------------------
# Train on tagged_documents
# corpus_count is recorded by build_vocab and equals len(tagged_documents).
Doc2Vec_model.train(
    tagged_documents,
    total_examples=Doc2Vec_model.corpus_count,
    epochs=300,
)
#----------------------------------------
# Infer a vector for a new document
# infer_vector is given a list of token strings, not the raw string.
print("== Document vector")
new_document = "he is a man"
new_doc_tokens = new_document.lower().split(" ")
new_doc_vector = Doc2Vec_model.infer_vector(new_doc_tokens)
# Only the first five components are shown to keep the output short.
print(f"Document, '{new_document}' to vector {new_doc_vector[:5]}")
#----------------------------------------
# Use word2vec similarity
# Compare the inferred document vector against the word vectors and report
# the words whose vectors are closest to it.
print("== word similarity")
closest_words = Doc2Vec_model.wv.similar_by_vector(new_doc_vector)
print(closest_words)
#----------------------------------------
# Use Doc2vec similarity
# Rank every trained document vector by its similarity to the inferred
# vector (topn is set to the number of stored document vectors).
print("== document similarity")
# gensim 4 renamed `docvecs` to `dv`; fall back to the old attribute name so
# the script also runs on gensim 3.
doc_vectors = getattr(Doc2Vec_model, "dv", None)
if doc_vectors is None:
    doc_vectors = Doc2Vec_model.docvecs
doc_sim_lst = doc_vectors.most_similar(
    positive=[new_doc_vector],
    topn=len(doc_vectors),
)
for doc_id, sim in doc_sim_lst:
    print(f"Document {doc_id} - similarity: {sim:.5f}")
print("== complete")
# ----------------------------------------
# Sample output pasted from the original post. The script prints a single
# list under "== word similarity"; the second list below is presumably from
# a separate run (values differ between runs).
#
# == Document vector
# Document, 'he is a man' to vector [-0.01550222 0.00654157 0.00314305 0.02054347 0.00220825]
# == word similarity
# [('is', 0.25534892082214355), ('man', 0.16589590907096863), ('a', 0.15215086936950684), ('girl', 0.11516174674034119), ('he', 0.09550413489341736), ('she', 0.012047693133354187), ('boy', -0.13721874356269836)]
# [('she', 0.22538770735263824), ('man', 0.13674762845039368), ('is', 0.0875273197889328), ('a', 0.043554868549108505), ('boy', -0.0036111846566200256), ('girl', -0.012933444231748581), ('he', -0.2287140041589737)]
# == document similarity
# Document 1 - similarity: 0.70888
# Document 2 - similarity: 0.68156
# Document 0 - similarity: 0.60137
# == complete
# (Blog page footer, not part of the script: "Leave a comment")