main.py

#Importing required modules
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from creatdata import twowords
from creatdata import stopwords
import time
import json
from Suggest_news import news

start = time.time()
#print(len(twowords))

#------------------------------------some functions---------------------------------------------------------
def count_dict(sentences):
    #Document frequency: number of sentences that contain each word of the global word_set
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
#Term Frequency
def termfreq(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance / N
#Inverse Document Frequency
def inverse_doc_freq(word):
    try:
        word_occurance = word_count[word] + 1
    except KeyError:
        word_occurance = 1
    return np.log(total_documents / word_occurance)
#TF-IDF
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        if word in word_set:
            tf = termfreq(sentence, word)
            idf = inverse_doc_freq(word)
            value = tf * idf
            tf_idf_vec[index_dict[word]] = value
    return tf_idf_vec
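# For a sentence d and a vocabulary word w, the weight stored at index_dict[w] is
#   tf(w, d) * idf(w) = (count of w in d / len(d)) * log(total_documents / (df(w) + 1)),
# where df(w) comes from the global word_count built by count_dict below.
# Words outside word_set are skipped, so every vector has length len(word_set).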
#Cosine similarity
def cos_cal(vec_a, vec_b):
    cos_sim = np.inner(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    return cos_sim
#Jaccard similarity (in practice: the fraction of positions where the two binary vectors agree)
def jaccard_similarity(A, B):
    tmp = 0
    for i in range(len(A)):
        if A[i] == B[i]:
            tmp += 1
    return tmp / len(A)
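# Quick sanity check (worked by hand, not executed here):
#   cos_cal([1, 0, 1], [1, 1, 0])            -> 1 / (sqrt(2) * sqrt(2)) ≈ 0.5
#   jaccard_similarity([1, 0, 1], [1, 1, 0]) -> 1/3 (only position 0 matches)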
#*********************************************************************************************
#Preprocessing the text data
sentences = []
word_set = []
with open("baothethao2.txt", encoding='utf8') as f:
    contents = f.readlines()
#print(contents)
for sent in contents:
    tmp = ""
    check = False   #when True, the next token is skipped (set after a two-word phrase is added)
    x = [i.lower() for i in word_tokenize(sent) if i.isalpha()]
    sentences.append(x)
    check2 = False  #True when the previous word was appended to word_set on its own
    for word in x:
        if check:
            check = False
            continue
        tmp2 = tmp + word
        tmp = word + ' '
        if (tmp2 in twowords) and (tmp2 not in word_set):
            #A known two-word phrase: remove the last single word that was appended
            #(if any was) and keep the phrase instead.
            if check2:
                word_set.pop()
            word_set.append(tmp2)
            check = True
            check2 = False
        elif (word in word_set) or (word in stopwords):
            continue
        else:
            word_set.append(word)
            check2 = True
with open('result.txt', 'w', encoding='utf8') as json_file:
    json.dump(word_set, json_file, ensure_ascii=False)
#print(len(word_set))
#Set of vocab
word_set = set(word_set)
#Total documents in our corpus
total_documents = len(sentences)
#Creating an index for each word in our vocab.
index_dict = {}  #Dictionary to store index for each word
for i, word in enumerate(word_set):
    index_dict[word] = i
#Creating word_count
word_count = count_dict(sentences)
with open('count_dict.txt', 'w', encoding='utf8') as json_file:
    json.dump(word_count, json_file, ensure_ascii=False)
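# index_dict fixes a position for every vocabulary word, so all TF-IDF vectors share
# the same dimension and ordering; word_count (the document frequencies) is what
# inverse_doc_freq reads when weighting each word.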
#TF-IDF Encoded text corpus
vectors = []
for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)
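# vectors now holds one TF-IDF array per sentence read from baothethao2.txt:
# a list of total_documents numpy arrays, each of length len(word_set).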
#Create user vector
tf_words = {}
vec_user = []
total_words = 0
for sent in sentences:
    total_words += len(sent)
avg = 0
jac_user_vec = []
for word in word_set:
    tf_words[word] = 0
    for sent in sentences:
        if word in sent:
            for tmp_word in sent:
                if word == tmp_word:
                    tf_words[word] += 1 / total_words
    avg += tf_words[word] / len(word_set)  #after the loop, avg is the mean term frequency
with open('tf_word.txt', 'w', encoding='utf8') as json_file:
    json.dump(tf_words, json_file, ensure_ascii=False)
for x, y in tf_words.items():
    if y >= avg:
        jac_user_vec.append(1)
    else:
        jac_user_vec.append(0)
    vec_user.append(y)
#print(vec_user)
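# At this point:
#   tf_words[w]  - term frequency of w over the whole profile text (counts / total_words)
#   vec_user     - dense user profile vector, in the same word order as tf_words
#   jac_user_vec - binarised profile (1 where the term frequency is at least avg, else 0),
#                  intended for the Jaccard-style comparison
# Both loops above iterate the same unmodified word_set, so vec_user lines up with the
# index_dict positions used by tf_idf(), making cos_cal(t_vec, vec_user) well defined.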
#*********************************************************************************************
#Testing
'''with open("testingrss.txt") as fii:
    thethao = fii.readlines()'''
res_cos = {}
link_title = {}
for i in range(len(news)):
    sent = news[i]['content']
    x = [token.lower() for token in word_tokenize(sent) if token.isalpha()]
    sentences.append(x)
    t_vec = tf_idf(x)
    #Binarise the article vector around its mean value, mirroring jac_user_vec
    jac_t_vec = []
    avg = 0
    for value in t_vec:
        avg += value / len(t_vec)
    for value in t_vec:
        if value >= avg:
            jac_t_vec.append(1)
        else:
            jac_t_vec.append(0)
    A = jac_user_vec  #binary vectors for jaccard_similarity (this branch is left
    B = jac_t_vec     #unused here; see the commented-out res_jac below)
    val = cos_cal(t_vec, vec_user)
    res_cos[news[i]['title']] = val
    link_title[news[i]['title']] = news[i]['link']
    sentences.pop()
res_cos = {k: v for k, v in sorted(res_cos.items(), key=lambda item: item[1], reverse=True)}
#res_jac={k: v for k, v in sorted(res_jac.items(), key=lambda item: item[1], reverse=True)}
'''
for key in res_cos:
    print(key, ' : ', res_cos[key], '\n')
print('----------------------- \n')
"""
for key in res_jac:
    print(key, ' : ', res_jac[key], '\n')
"""
print("--- %s seconds ---" % (time.time() - start))
'''
print(link_title)
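# A possible way to surface the top recommendations (a sketch, not part of the original
# script): res_cos is already sorted by cosine score in descending order, and link_title
# maps each title back to its link, so the first k keys of res_cos are the k articles
# closest to the user profile. For example:
#
#   for title in list(res_cos)[:5]:  #top-5 titles by cosine score
#       print(res_cos[title], title, link_title[title])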