main.py

#Importing required modules
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from creatdata import twowords
from creatdata import stopwords
import time
import json
from Suggest_news import news

start = time.time()
#print(len(twowords))

#------------------------------------some functions---------------------------------------------------------
def count_dict(sentences):
    #Document frequency: number of sentences that contain each word of the global word_set
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count
#Term Frequency
def termfreq(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance / N
#Inverse Document Frequency
def inverse_doc_freq(word):
    try:
        word_occurance = word_count[word] + 1
    except KeyError:
        word_occurance = 1
    return np.log(total_documents / word_occurance)
#TF-IDF
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set),))
    for word in sentence:
        if word in word_set:
            tf = termfreq(sentence, word)
            idf = inverse_doc_freq(word)
            value = tf * idf
            tf_idf_vec[index_dict[word]] = value
    return tf_idf_vec
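# For a sentence d and a vocabulary word w, the weight stored at index_dict[w] is
#   tf(w, d) * idf(w) = (count of w in d / len(d)) * log(total_documents / (df(w) + 1)),
# where df(w) comes from the global word_count built by count_dict below.
# Words outside word_set are skipped, so every vector has length len(word_set).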
#Cosine similarity
def cos_cal(vec_a, vec_b):
    cos_sim = np.inner(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    return cos_sim
#Jaccard similarity (in practice: the fraction of positions where the two binary vectors agree)
def jaccard_similarity(A, B):
    tmp = 0
    for i in range(len(A)):
        if A[i] == B[i]:
            tmp += 1
    return tmp / len(A)
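# Quick sanity check (worked by hand, not executed here):
#   cos_cal([1, 0, 1], [1, 1, 0])            -> 1 / (sqrt(2) * sqrt(2)) ≈ 0.5
#   jaccard_similarity([1, 0, 1], [1, 1, 0]) -> 1/3 (only position 0 matches)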
#*********************************************************************************************
#Preprocessing the text data
sentences = []
word_set = []
with open("baothethao2.txt", encoding='utf8') as f:
    contents = f.readlines()
#print(contents)
for sent in contents:
    tmp = ""
    check = False   #when True, the next token is skipped (set after a two-word phrase is added)
    x = [i.lower() for i in word_tokenize(sent) if i.isalpha()]
    sentences.append(x)
    check2 = False  #True when the previous word was appended to word_set on its own
    for word in x:
        if check:
            check = False
            continue
        tmp2 = tmp + word
        tmp = word + ' '
        if (tmp2 in twowords) and (tmp2 not in word_set):
            #A known two-word phrase: remove the last single word that was appended
            #(if any was) and keep the phrase instead.
            if check2:
                word_set.pop()
            word_set.append(tmp2)
            check = True
            check2 = False
        elif (word in word_set) or (word in stopwords):
            continue
        else:
            word_set.append(word)
            check2 = True
with open('result.txt', 'w', encoding='utf8') as json_file:
    json.dump(word_set, json_file, ensure_ascii=False)
#print(len(word_set))
#Set of vocab
word_set = set(word_set)
#Total documents in our corpus
total_documents = len(sentences)
#Creating an index for each word in our vocab.
index_dict = {}  #Dictionary to store index for each word
for i, word in enumerate(word_set):
    index_dict[word] = i
#Creating word_count
word_count = count_dict(sentences)
with open('count_dict.txt', 'w', encoding='utf8') as json_file:
    json.dump(word_count, json_file, ensure_ascii=False)
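# index_dict fixes a position for every vocabulary word, so all TF-IDF vectors share
# the same dimension and ordering; word_count (the document frequencies) is what
# inverse_doc_freq reads when weighting each word.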
#TF-IDF Encoded text corpus
vectors = []
for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)
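# vectors now holds one TF-IDF array per sentence read from baothethao2.txt:
# a list of total_documents numpy arrays, each of length len(word_set).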
#Create user vector
tf_words = {}
vec_user = []
total_words = 0
for sent in sentences:
    total_words += len(sent)
avg = 0
jac_user_vec = []
for word in word_set:
    tf_words[word] = 0
    for sent in sentences:
        if word in sent:
            for tmp_word in sent:
                if word == tmp_word:
                    tf_words[word] += 1 / total_words
    avg += tf_words[word] / len(word_set)  #after the loop, avg is the mean term frequency
with open('tf_word.txt', 'w', encoding='utf8') as json_file:
    json.dump(tf_words, json_file, ensure_ascii=False)
for x, y in tf_words.items():
    if y >= avg:
        jac_user_vec.append(1)
    else:
        jac_user_vec.append(0)
    vec_user.append(y)
#print(vec_user)
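# At this point:
#   tf_words[w]  - term frequency of w over the whole profile text (counts / total_words)
#   vec_user     - dense user profile vector, in the same word order as tf_words
#   jac_user_vec - binarised profile (1 where the term frequency is at least avg, else 0),
#                  intended for the Jaccard-style comparison
# Both loops above iterate the same unmodified word_set, so vec_user lines up with the
# index_dict positions used by tf_idf(), making cos_cal(t_vec, vec_user) well defined.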
#*********************************************************************************************
#Testing
'''with open("testingrss.txt") as fii:
    thethao = fii.readlines()'''
res_cos = {}
link_title = {}
for i in range(len(news)):
    sent = news[i]['content']
    x = [token.lower() for token in word_tokenize(sent) if token.isalpha()]
    sentences.append(x)
    t_vec = tf_idf(x)
    #Binarise the article vector around its mean value, mirroring jac_user_vec
    jac_t_vec = []
    avg = 0
    for value in t_vec:
        avg += value / len(t_vec)
    for value in t_vec:
        if value >= avg:
            jac_t_vec.append(1)
        else:
            jac_t_vec.append(0)
    A = jac_user_vec  #binary vectors for jaccard_similarity (this branch is left
    B = jac_t_vec     #unused here; see the commented-out res_jac below)
    val = cos_cal(t_vec, vec_user)
    res_cos[news[i]['title']] = val
    link_title[news[i]['title']] = news[i]['link']
    sentences.pop()
res_cos = {k: v for k, v in sorted(res_cos.items(), key=lambda item: item[1], reverse=True)}
#res_jac={k: v for k, v in sorted(res_jac.items(), key=lambda item: item[1], reverse=True)}
'''
for key in res_cos:
    print(key, ' : ', res_cos[key], '\n')
print('----------------------- \n')
"""
for key in res_jac:
    print(key, ' : ', res_jac[key], '\n')
"""
print("--- %s seconds ---" % (time.time() - start))
'''
print(link_title)
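# A possible way to surface the top recommendations (a sketch, not part of the original
# script): res_cos is already sorted by cosine score in descending order, and link_title
# maps each title back to its link, so the first k keys of res_cos are the k articles
# closest to the user profile. For example:
#
#   for title in list(res_cos)[:5]:  #top-5 titles by cosine score
#       print(res_cos[title], title, link_title[title])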