import collections

import networkx as nx
import pandas as pd
from scipy.spatial.distance import euclidean

def basic_filtering_for_series(input_series):
    # lowercase and strip each cell, then split it on ";" into a list of keywords
    r_s = input_series.fillna("").apply(lambda s: s.strip().lower())
    r_s = r_s.apply(lambda s: s.split(";"))
    # keep only lowercase letters, digits, and spaces; drop other special characters
    def change_word(in_w):
        r_c = [c for c in in_w if 'a' <= c <= 'z' or '0' <= c <= '9' or c == ' ']
        return "".join(r_c).strip()
    r_s = r_s.apply(lambda ks: [k for k in map(change_word, ks) if k != ""])
    return r_s
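# NOTE: `df` is assumed to have been loaded in an earlier step, e.g. from a
# Scopus-style CSV export containing 'Author Keywords' and 'Index Keywords'
# columns with ";"-separated values (the file name below is hypothetical):
# df = pd.read_csv("scopus_export.csv")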
auth_col = basic_filtering_for_series(df['Author Keywords'])
index_col = basic_filtering_for_series(df['Index Keywords'])
auth_index_df = pd.DataFrame({"auth_kwd":auth_col, "index_kwd":index_col})
biG = nx.Graph()
edge_lst = []
# collect (author keyword, index keyword) pairs per paper; the "(i)" suffix marks index-keyword nodes
for i in range(len(auth_index_df)):
    auth_l = list(auth_index_df['auth_kwd'].iloc[i])
    index_l = list(auth_index_df['index_kwd'].iloc[i])
    if len(auth_l) != 0 and len(index_l) != 0:
        edge_lst += [(auth, ind + "(i)") for auth in auth_l for ind in index_l]
# add nodes; accumulate each node's weight from its edges' co-occurrence counts
for e in collections.Counter(edge_lst).most_common():
    auth_node, ind_node = e[0]
    e_weight = e[1]
    for node in [auth_node, ind_node]:
        if node in biG.nodes():
            biG.nodes[node]['weight'] += e_weight
        else:
            biG.add_nodes_from([(node, {'weight': e_weight})])
# add edges, weighted by how often each keyword pair co-occurs
biG.add_edges_from(
    (e[0][0], e[0][1], {'weight': e[1]}) for e in collections.Counter(edge_lst).most_common()
)
print("complete")
rawBiG = biG.copy()
biG = rawBiG.copy()
print("is bipartite?: {}".format(nx.is_bipartite(biG)))
#left, right = nx.bipartite.sets(biG)
# drop thw low nodes
print("before filtering node size: {}".format(len(biG.nodes())))
for n in biG.copy().nodes(data=True):
if n[1]['weight'] < 100: # 쓸데없는 node들을 삭제합니다.
biG.remove_node(n[0])
print("after filtering node size: {}".format(len(biG.nodes())))
nodesetA, nodesetB = nx.bipartite.sets(biG)
# fix the node orderings so the matrix rows/columns line up with the DataFrame labels below
row_nodes, col_nodes = list(nodesetA), list(nodesetB)
try:
    biadj_matrix = nx.algorithms.bipartite.biadjacency_matrix(biG, row_order=row_nodes, column_order=col_nodes)
except Exception:
    print("not yet")
    print()
bi_df = pd.DataFrame(biadj_matrix.toarray(), index=row_nodes, columns=col_nodes)
bi_df = bi_df.apply(lambda col: col / (col.max() - col.min()))  # scale each column by its range (min-max scaling; column minima are 0 here)
# pairwise euclidean distances between all row nodes in the scaled keyword space
n_n_dist_lst = [
    (bi_df.index[i], bi_df.index[j], euclidean(bi_df.iloc[i], bi_df.iloc[j]))
    for i in range(len(bi_df) - 1) for j in range(i + 1, len(bi_df))
]
# print the ten most distant keyword pairs
for nn in sorted(n_n_dist_lst, key=lambda x: x[2], reverse=True)[:10]:
    print(nn)
is bipartite?: True
before filtering node size: 38369
after filtering node size: 791
('smes', 'shape memory effect', 15.436279758020806)
('shape memory alloys', 'smes', 15.313835112256491)
('smes', 'apoptosis', 15.297551958555223)
('antioxidant', 'smes', 15.297110857498893)
('smes', 'martensitic transformation', 15.28047904820015)
('biodiesel', 'smes', 15.252823023508066)
('platinum', 'smes', 15.250537382105268)
('smes', 'crystal structure', 15.24602190003883)
('xray diffraction', 'smes', 15.228784583558754)
('oxidative stress', 'smes', 15.22873389330548)