@@ -0,0 +1,423 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+from sys import platform
+import time
+import json
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+
+from pygologin.gologin import GoLogin
+from underthesea import ner
+from bs4 import BeautifulSoup
+import requests
+
+from datetime import datetime
+from datetime import timedelta
+
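+# Read article records from baomoi_testing_crawling.txt, search Google for
+# coverage of the same story on a fixed list of Vietnamese news sites, crawl
+# the matching articles, and write the grouped results to testing.txt.
+# Browsing goes through GoLogin-managed Chrome profiles so that traffic can be
+# routed through the proxies listed in proxylist.txt.
+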
+# Load proxies from proxylist.txt (one "host:port" entry per line)
+proxy_list = []
+for line in open("proxylist.txt", "r"):
+    proxy_list.append(line.strip().split(":"))
+proxy_check = []
+
+TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzU4ZGExYTMyMzA4NDUzNDYwYjMwOTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzU4ZGEzNDM5OGJmNTFkM2IyMjc5OTQifQ.8LBET_Bp0BK7W7nCafDQD1BV3nKkmKIXA7iltU0z0VA"
+
+gl = GoLogin({
+    "token": TOKEN,
+    'tmpdir': "/tmp/",
+    "local": True,
+    "credentials_enable_service": False,
+})
+
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+
+
+"""
+profile_id = gl.create({
+    "name": 'profile_1',
+    "os": 'mac',
+    "proxyEnabled": True,
+    "navigator": {
+        "language": 'en-US,en;q=0.9,he;q=0.8',
+        "userAgent": 'MyUserAgent',
+        "resolution": '1024x768',
+        "platform": 'darwin',
+    },
+    "proxy": {
+        'mode': 'http',
+        'host': host,
+        'port': port,
+        'username': "prateep6793",
+        'password': "Zing1234",
+    }
+})
+"""
+
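+# creat_new_profile_id builds a GoLogin browser profile whose traffic is routed
+# through proxy_list[i], using the hard-coded proxy credentials below.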
+def creat_new_profile_id(gl, i):
+    host = proxy_list[i][0]
+    port = proxy_list[i][1]
+    profile_id = gl.create({
+        "name": 'profile_1',
+        "os": 'mac',
+        "proxyEnabled": True,
+        "navigator": {
+            "language": 'en-US,en;q=0.9,he;q=0.8',
+            "userAgent": 'MyUserAgent',
+            "resolution": '1024x768',
+            "platform": 'darwin',
+        },
+        "proxy": {
+            'mode': 'http',
+            'host': host,
+            'port': port,
+            'username': "prateep6793",
+            'password': "Zing1234",
+        }
+    })
+    return profile_id
+
+def clear_proxy_list(gl, driver):
+    # Open one GoLogin profile per proxy and record which proxies work
+    i = 0
+    while (i < len(proxy_list)):
+        try:
+            profile_id = creat_new_profile_id(gl, i)
+            gl = GoLogin({
+                "token": TOKEN,
+                'profile_id': profile_id,
+            })
+            chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+            debugger_address = gl.start()
+            chrome_options = Options()
+            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+            driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+            proxy_check.append(True)
+        except Exception:
+            print("Error Proxy!")
+            proxy_check.append(False)
+        i += 1
+
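+# After clear_proxy_list runs, proxy_check[i] records whether proxy_list[i]
+# produced a working profile.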
+
+
+# ----------------------------
+
+'''
+# Alternative: plain ChromeDriver without GoLogin
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
+chrome_options = Options()
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+'''
+
+start = time.time()
+print("ok")
+# ---------- helper functions ----------
+def find_first_link():
+    # Print every result link that points at the current `site`; relies on the
+    # module-level `driver` and the `site` variable set in the main loop.
+    for tmp in driver.find_elements(By.TAG_NAME, 'a'):
+        extracted_link = tmp.get_attribute("href")
+        if (extracted_link is not None):
+            if (extracted_link.find("https://" + site + "/") == 0):
+                print(extracted_link)
+
+def create_link(site):
+    # Build a Google search URL restricted to `site`, using the module-level
+    # `searching_key` and `date` set in the main loop.
+    link = 'https://www.google.com/search?q=' + searching_key + '+site%3A' + site + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt&tbs=cdr%3A1%2Ccd_'
+    date_from = date - timedelta(days=1)
+    date_to = date + timedelta(days=1)
+    year_from = date_from.strftime("%Y")
+    year_to = date_to.strftime("%Y")
+    month_from = date_from.strftime("%m")
+    month_to = date_to.strftime("%m")
+    day_from = date_from.strftime("%d")
+    day_to = date_to.strftime("%d")
+    # Custom date range: cd_min is the day before the article date, cd_max the day after
+    tmp = "&tbs=cdr:1,cd_min:" + month_from + "/" + day_from + "/" + year_from + ",cd_max:" + month_to + "/" + day_to + "/" + year_to
+    #print(link+tmp)
+    return link + tmp
+
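+# Example with hypothetical values: for searching_key '"Hà+Nội"%2B"COVID-19"'
+# and date 2022-09-20, create_link("vnexpress.net") yields a URL of the form
+# ...search?q="Hà+Nội"%2B"COVID-19"+site%3Avnexpress.net...&tbs=cdr:1,cd_min:09/19/2022,cd_max:09/21/2022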
+def crawl(link, site):
+    # Fetch `link` and pull out the title, summary ("sapo") and body text using
+    # per-site HTML selectors.
+    news = {}
+    t_title = ""
+    t_description = ""
+    t_contents = ''
+    url = requests.get(link)
+    t_soup = BeautifulSoup(url.text, 'lxml')
+
+    if (site == "thanhnien.vn"):
+        for title in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('div', {'class': 'cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vnexpress.net"):
+        for title in t_soup.findAll('h1', {'class': 'title-detail'}):
+            t_title = title.text
+        for description in t_soup.findAll('p', {'class': 'description'}):
+            t_description = description.text
+        for contents in t_soup.findAll('p', {'class': 'Normal'}):
+            t_contents += contents.text + ". "
+
+    if (site == "tienphong.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vov.vn"):
+        for title in t_soup.findAll('div', {'class': 'row article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'row article-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'row article-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td'):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "nhandan.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "zingnews.vn"):
+        for title in t_soup.findAll('h1', {'class': 'the-article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('p', {'class': 'the-article-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'the-article-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "tuoitre.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'content fck'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "cand.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'box-title-detail entry-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'box-des-detail this-one'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'detail-content-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vtv.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title_detail'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'ta-justify'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+                tmp = len(content.text + ". ")
+
+    if (site == "24h.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'clrTit bld tuht_show'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'ctTp tuht_show'}):
+            t_description = description.text
+        for contents in t_soup.findAll('article', {'class': 'nwsHt nwsUpgrade'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "dantri.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title-page detail'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'singular-sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'singular-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "baophapluat.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "kenh14.vn"):
+        for title in t_soup.findAll('h1', {'class': 'kbwc-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'knc-sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'knc-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "laodong.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'chappeau'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'art-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "qdnd.vn"):
+        for title in t_soup.findAll('h1', {'class': 'post-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'post-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'post-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vtc.vn"):
+        for title in t_soup.findAll('h1', {'class': 'font28 bold lh-1-3'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'font18 bold inline-nb'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'edittor-content box-cont mt15 clearfix '}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "toquoc.vn"):
+        for title in t_soup.findAll('h1', {'class': 'entry-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'data-role': 'content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "", 'date': ""}
+    if t_title == "":
+        return {}
+    return news
+
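+# crawl() returns {} when no title is extracted (unrecognised page layout);
+# otherwise a dict with title, description and the concatenated body text.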
+#-----------------------
+sites = {'vnexpress.net', 'thanhnien.vn', 'tienphong.vn',
+         'vov.vn', 'nhandan.vn', 'zingnews.vn',
+         'tuoitre.vn', 'cand.com.vn', 'vtv.vn',
+         '24h.com.vn', 'dantri.com.vn', 'baophapluat.vn',
+         'kenh14.vn', 'laodong.vn', 'qdnd.vn', 'vtc.vn',
+         'toquoc.vn'}
+
+fi = open("baomoi_testing_crawling.txt", "r")
+fo = open("testing.txt", 'w')
+#--------------------------
+
+clear_proxy_list(gl, driver)
+
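+# For each input record: pull noun / proper-noun keywords from the title with
+# underthesea, search Google on every site within ±1 day of the article date,
+# crawl the first matching result per site, and write the original record plus
+# the matched articles as one JSON array per line.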
+for line in fi.readlines():
+    a = json.loads(line)
+    t_str = ner(a["title"])
+    #---
+    t_date = a["date"]
+    year = int(t_date[0:4])
+    month = int(t_date[5:7])
+    day = int(t_date[8:10])
+    date = datetime(year, month, day)
+    #---
+    # Keep only nouns (N) and proper nouns (Np) from the title as search terms
+    searching_key = ''
+    for words in t_str:
+        if (words[1] == "N") or (words[1] == "Np"):
+            searching_key += '"' + words[0] + '"' + "%2B"
+    searching_key = searching_key.replace(" ", "+")
+    searching_key = searching_key[0:len(searching_key) - 3]  # drop trailing "%2B"
+    source = [line]
+    for site in sites:
+        #print(create_link(site))
+        check_link = create_link(site)
+        driver.get(check_link)
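+        # Give the results page a moment to load; Google sends blocked requests
+        # to a /sorry/ captcha page, which is detected below.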
+        time.sleep(1)
+        #print(check_link)
+        if (driver.current_url.find("sorry") != -1):
+            print("Error!")
+        else:
+            print(driver.current_url)
+        """
+        try:
+            i+=1
+            driver.close()
+            gl = GoLogin({
+                "token": TOKEN,
+                'profile_id': creat_new_profile_id(gl,i),
+            })
+            debugger_address = gl.start()
+            chrome_options = Options()
+            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+            driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+
+            driver.get(check_link)
+        except:
+            pass
+
+        #print("#---------------------"+"\n")
+        '''
+        while (driver.current_url!=check_link):
+            driver.delete_all_cookies()
+            driver.refresh()
+            driver.get(check_link)
+        '''
+        """
+        #driver.execute_script("window.open("+"'"+create_link(site)+"'"+");")
+        for tmp in driver.find_elements(By.TAG_NAME, 'a'):
+            extracted_link = tmp.get_attribute("href")
+            if (extracted_link is not None):
+                if (extracted_link.find("https://" + site + "/") == 0):
+                    print(extracted_link)
+                    news = crawl(extracted_link, site)
+                    if news != {}:
+                        source.append(news)
+                    break
+        #time.sleep(2)
+        #crawling(create_link(site))
+        # print(source)
+
+    fo.write(json.dumps(source, ensure_ascii=False))
+    fo.write('\n')
+
+
+fi.close()
+fo.close()
+
+print("--- %s seconds ---" % (time.time() - start))