push all files

Nguyen Duc Tai · 2 years ago · commit 270378b817

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+.idea
+transformers
+pygubu
+pygologin
+gologincrawl
+Crawl_Baomoi/source
+Crawl_Baomoi/baomoi2.txt
+Crawl_Baomoi/baomoi2_tmp.txt

+ 3 - 0
.inputrc

@@ -0,0 +1,3 @@
+set completion-ignore-case on
+set show-all-if-ambiguous on
+TAB: menu-complete

File diff suppressed because it is too large
+ 10 - 0
Crawl_Baomoi/baomoi_testing_crawling.txt


+ 65 - 0
Crawl_Baomoi/crawl_from_Baomoi.py

@@ -0,0 +1,65 @@
+import sys
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+
+from bs4 import BeautifulSoup
+import requests
+import time
+import json
+
+start=time.time()
+
+fi=open("links2.txt","r")
+fo=open("","a")  # NOTE: output path left blank in the commit; set it to the JSONL file the crawl should append to
+
+news={}
+#news[len(news)] = {'title': title, 'link': link, 'content': title + ' ' + description, 'category': category, 'page':page}
+
+i=0
+
+start=time.time()
+
+for line in fi.readlines():
+    i+=1
+    if (i<69878): continue  # resume point: skip links already crawled in an earlier run
+    link=line.strip()
+    try:
+        url=requests.get(link)
+        if url.history: continue
+        t_soup = BeautifulSoup(url.text, 'lxml')
+        t_title = t_description = t_date = t_category = t_content = ""  # reset every field per link so a missing tag does not reuse the previous article's values
+        for title in t_soup.findAll('h1', {'class': 'bm_J'}):
+            t_title=title.text
+        for description in t_soup.findAll('h3', {'class': 'bm_Ak bm_J'}):
+            t_description = description.text
+        for date in t_soup.findAll('time'):
+            if date.has_attr('datetime'):
+                t_date=date['datetime']
+        for category in t_soup.findAll('a', {'class': 'bm_y'}):
+            t_category=category.text
+        for content in t_soup.findAll('p', {'class': 'bm_Y'}):
+            t_content+=content.text+" "
+        for content in t_soup.findAll('p', {'class': 'bm_Y bm_FP'}):
+            t_content+=content.text+" "
+        news = {'title': t_title, 'description': t_description, 'content': t_content, 'category': t_category, 'date': t_date}
+        fo.write(json.dumps(news, ensure_ascii=False))
+        fo.write('\n')
+        print(i)
+    except Exception as e:
+        print("Error!", link, e)
+
+fi.close()
+fo.close()
+
+print("--- %s seconds ---" % (time.time() - start))
+
+
+
+
+
+
+
+

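Note: each pass of the loop above writes one standalone JSON object per line, so the output is effectively a JSONL file. A minimal sketch of reading it back (the path is an assumption, since the open() call in the script leaves the output filename blank):

import json

# Hypothetical output path; crawl_from_Baomoi.py leaves the output filename empty.
with open("baomoi2.txt", "r", encoding="utf-8") as fh:
    articles = [json.loads(line) for line in fh if line.strip()]

print(len(articles), "articles loaded")
print(articles[0]["title"], "|", articles[0]["category"])
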
+ 226 - 0
Crawl_Baomoi/crawl_from_others.py

@@ -0,0 +1,226 @@
+import sys
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from pygologin.gologin import GoLogin
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+
+from underthesea import ner
+from bs4 import BeautifulSoup
+import requests
+
+
+from datetime import datetime
+from datetime import timedelta
+
+gl = GoLogin({
+    "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzJjMjliMjJlMjIxZjVlMjc5Yzc4ZTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzJjMmI3OTlmYjIxNDI0YTFmNTQzZTUifQ.GR4iJFqUVRuI3XO_Ns3cfiII2m8CactTGU9jhNaSf-k",
+    "profile_id": "632c5184cef566f424ef2e3c",
+    # "port": random_port
+})
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+# ----------------------------
+
+
+'''
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
+chrome_options = Options()
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+'''
+
+start=time.time()
+
+# ----------some def------------------
+def find_first_link():
+    for tmp in driver.find_elements(By.TAG_NAME,'a'):
+        extracted_link=tmp.get_attribute("href")
+        if (extracted_link!=None):
+            if (extracted_link.find("https://"+site+"/")==0):
+                print(extracted_link)
+
+def create_link(site):
+    link = 'https://www.google.com/search?q=' + searching_key + '+site%3A' + site + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt&tbs=cdr%3A1%2Ccd_'
+    date_from=date-timedelta(days=1)
+    date_to=date+timedelta(days=1)
+    year_from=date_from.strftime("%Y")
+    year_to=date_to.strftime("%Y")
+    month_from=date_from.strftime("%m")
+    month_to=date_to.strftime("%m")
+    day_from=date_from.strftime("%d")
+    day_to=date_to.strftime("%d")
+    tmp = 'min%3A'+month_from+'%2F'+day_from+'%2F'+year_from+'%2Ccd_max%3A'+month_to+ '%2F'+ day_to+  '%2F'+ year_to+  '&tbm='
+    return link+tmp
+
+def crawl(link,site):
+
+    news = {}
+    t_title = ""
+    t_description = ""
+    t_contents = ''
+    url = requests.get(link)
+    t_soup = BeautifulSoup(url.text, 'lxml')
+
+    if (site=="thanhnien.vn"):
+        for title in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('div', {'class': 'cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+    
+    if (site=="vnexpress.net"):
+        for title in t_soup.findAll('h1', {'class': 'title-detail'}):
+            t_title=title.text
+        for description in t_soup.findAll('p', {'class': 'description'}):
+            t_description=description.text
+        for contents in t_soup.findAll('p', {'class': 'Normal'}):
+            t_contents+=contents.text+". "
+
+    if (site=="tienphong.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+
+    if (site=="vov.vn"):
+        for title in t_soup.findAll('div', {'class': 'row article-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'row article-summary'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'row article-content'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('td'):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+
+    if (site=="nhandan.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site=="zingnews.vn"):
+        for title in t_soup.findAll('h1', {'class': 'the-article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('p', {'class': 'the-article-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'the-article-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site=="tuoitre.vn"):
+
+        for title in t_soup.findAll('h1', {'class': 'article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'content fck'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+
+    news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "",'date':""}
+    if t_title=="": return {}
+    return news
+
+#-----------------------
+sites={'vnexpress.net','thanhnien.vn','tienphong.vn','vov.vn','nhandan.vn','zingnews.vn','tuoitre.vn'}
+
+fi=open("baomoi_testing_crawling.txt","r")
+fo=open("testing.txt",'w')
+i=0
+#--------------------------
+for line in fi.readlines():
+    a=json.loads(line)
+    t_str=ner(a["title"])
+    #---
+    t_date=a["date"]
+    year=int(t_date[0:4])
+    month=int(t_date[5:7])
+    day=int(t_date[8:10])
+    date=datetime(year,month,day)
+    #---
+    searching_key= ''
+    for words in t_str:
+        if (words[1]=="N") or (words[1]=="Np"):
+            searching_key+= '"' + words[0] + '"' + "%2B"
+    searching_key=searching_key.replace(" ", "+")
+    searching_key= searching_key[0:len(searching_key) - 3]
+    source = [line]
+    for site in sites:
+        #print(create_link(site))
+        check_link=create_link(site)
+        driver.get(check_link)
+        time.sleep(0.5)
+        #print(check_link)
+        #print(driver.current_url)
+        while (driver.current_url.find("https://www.google.com/search")==-1):
+            driver.delete_all_cookies()
+            driver.refresh()
+            driver.get(check_link)
+            time.sleep(0.5)
+        #print("#---------------------"+"\n")
+        '''
+        while (driver.current_url!=check_link):
+            driver.delete_all_cookies()
+            driver.refresh()
+            driver.get(check_link)
+        '''
+        #driver.execute_script("window.open("+"'"+create_link(site)+"'"+");")
+        for tmp in driver.find_elements(By.TAG_NAME, 'a'):
+            extracted_link = tmp.get_attribute("href")
+            if (extracted_link != None):
+                if (extracted_link.find("https://" + site + "/") == 0):
+                    #print(extracted_link)
+                    print(extracted_link)
+                    news = crawl(extracted_link,site)
+                    if news!={}:
+                        source.append(news)
+                    break
+        #time.sleep(2)
+        #crawling(create_link(site))
+        # print(source)
+
+    fo.write(json.dumps(source, ensure_ascii=False))
+    fo.write('\n')
+
+
+print("--- %s seconds ---" % (time.time() - start))
+
+
+
+
+
+
+
+

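For reference, create_link() composes a Google search URL limited to one news site and to a one-day window on either side of the Baomoi article date (the cd_min/cd_max parameters; %2F encodes '/' and %3A encodes ':'). A standalone sketch of the same construction, with searching_key and date passed in explicitly instead of read from globals:

from datetime import datetime, timedelta

def build_site_search_url(searching_key, date, site):
    # Same scheme as create_link(): keyword query restricted to `site`,
    # dated from one day before to one day after `date` (US m/d/Y format).
    date_from = date - timedelta(days=1)
    date_to = date + timedelta(days=1)
    cd_min = date_from.strftime("%m") + "%2F" + date_from.strftime("%d") + "%2F" + date_from.strftime("%Y")
    cd_max = date_to.strftime("%m") + "%2F" + date_to.strftime("%d") + "%2F" + date_to.strftime("%Y")
    return ("https://www.google.com/search?q=" + searching_key + "+site%3A" + site
            + "&tbs=cdr%3A1%2Ccd_min%3A" + cd_min + "%2Ccd_max%3A" + cd_max + "&tbm=")

# Example: articles mentioning "Ha Noi" on vnexpress.net around 22 Sep 2022.
print(build_site_search_url('"Ha+Noi"', datetime(2022, 9, 22), "vnexpress.net"))
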
+ 423 - 0
Crawl_Baomoi/crawl_from_others_tesing_google.py

@@ -0,0 +1,423 @@
+from sys import platform
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from pygologin.gologin import GoLogin
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+
+from underthesea import ner
+from bs4 import BeautifulSoup
+import requests
+
+
+from datetime import datetime
+from datetime import timedelta
+
+proxy_list=[]
+for line in open("proxylist.txt","r"):
+    proxy_list.append(line.split(":"))
+proxy_check=[]
+
+TOKEN="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzU4ZGExYTMyMzA4NDUzNDYwYjMwOTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzU4ZGEzNDM5OGJmNTFkM2IyMjc5OTQifQ.8LBET_Bp0BK7W7nCafDQD1BV3nKkmKIXA7iltU0z0VA"
+
+gl = GoLogin({
+	"token": TOKEN,
+    'tmpdir':"/tmp/",
+    "local":True,
+    "credentials_enable_service": False,
+})
+
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+
+
+"""
+profile_id = gl.create({
+    "name": 'profile_1',
+    "os": 'mac',
+    "proxyEnabled": True,
+    "navigator": {
+        "language": 'en-US,en;q=0.9,he;q=0.8',
+        "userAgent": 'MyUserAgent',
+        "resolution": '1024x768',
+        "platform": 'darwin',
+    },
+    "proxy":{
+        'mode': 'http',
+        'host': host,
+        'port': port,
+        'username': "prateep6793",
+        'password': "Zing1234",
+    }
+});
+
+"""
+
+def creat_new_profile_id(gl, i):
+    host=proxy_list[i][0]
+    port=proxy_list[i][1]
+    profile_id = gl.create({
+        "name": 'profile_1',
+        "os": 'mac',
+        "proxyEnabled": True,
+        "navigator": {
+            "language": 'en-US,en;q=0.9,he;q=0.8',
+            "userAgent": 'MyUserAgent',
+            "resolution": '1024x768',
+            "platform": 'darwin',
+        },
+        "proxy":{
+            'mode': 'http',
+            'host': host,
+            'port': port,
+            'username': "prateep6793",
+            'password': "Zing1234",
+        }
+    });
+    return profile_id
+
+def clear_proxy_list(gl,driver):
+    i=0
+    while (i<len(proxy_list)):
+        try:
+            profile_id=creat_new_profile_id(gl,i)
+            gl = GoLogin({
+                "token": TOKEN,
+                'profile_id': profile_id,
+            })
+            chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+            debugger_address = gl.start()
+            chrome_options = Options()
+            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+            driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+            proxy_check.append(True)   # mark this proxy as working (the original indexed into an empty list)
+        except:
+            print("Error Proxy!")
+            proxy_check.append(False)  # mark this proxy as failed
+        i+=1
+
+
+
+# ----------------------------
+
+
+'''
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
+chrome_options = Options()
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+'''
+
+start=time.time()
+print("ok")
+# ----------some def------------------
+def find_first_link():
+    for tmp in driver.find_elements(By.TAG_NAME,'a'):
+        extracted_link=tmp.get_attribute("href")
+        if (extracted_link!=None):
+            if (extracted_link.find("https://"+site+"/")==0):
+                print(extracted_link)
+
+def create_link(site):
+    link = 'https://www.google.com/search?q=' + searching_key + '+site%3A' + site + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt&tbs=cdr%3A1%2Ccd_'
+    date_from=date-timedelta(days=1)
+    date_to=date+timedelta(days=1)
+    year_from=date_from.strftime("%Y")
+    year_to=date_to.strftime("%Y")
+    month_from=date_from.strftime("%m")
+    month_to=date_to.strftime("%m")
+    day_from=date_from.strftime("%d")
+    day_to=date_to.strftime("%d")
+    tmp = "&tbs=cdr:1,cd_min:"+month_from+"/"+day_from+"/"+year_from+",cd_max:"+month_to+"/"+day_to+"/"+year_to  # cd_min must be the earlier date and cd_max the later one; the original had them swapped
+    #print(link+tmp)
+    return link+tmp
+
+def crawl(link,site):
+
+    news = {}
+    t_title = ""
+    t_description = ""
+    t_contents = ''
+    url = requests.get(link)
+    t_soup = BeautifulSoup(url.text, 'lxml')
+
+    if (site=="thanhnien.vn"):
+        for title in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('div', {'class': 'cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+
+    if (site=="vnexpress.net"):
+        for title in t_soup.findAll('h1', {'class': 'title-detail'}):
+            t_title=title.text
+        for description in t_soup.findAll('p', {'class': 'description'}):
+            t_description=description.text
+        for contents in t_soup.findAll('p', {'class': 'Normal'}):
+            t_contents+=contents.text+". "
+
+    if (site=="tienphong.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+
+    if (site=="vov.vn"):
+        for title in t_soup.findAll('div', {'class': 'row article-title'}):
+            t_title=title.text
+        for description in t_soup.findAll('div', {'class': 'row article-summary'}):
+            t_description=description.text
+        for contents in t_soup.findAll('div', {'class': 'row article-content'}):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+        for contents in t_soup.findAll('td'):
+            for content in contents.findAll('p'):
+                t_contents+=content.text+". "
+
+    if (site=="nhandan.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site=="zingnews.vn"):
+        for title in t_soup.findAll('h1', {'class': 'the-article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('p', {'class': 'the-article-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'the-article-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site=="tuoitre.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'content fck'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site=="cand.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'box-title-detail entry-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'box-des-detail this-one'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'detail-content-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vtv.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title_detail'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'ta-justify'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+                tmp = len(content.text + ". ")
+
+    if (site == "24h.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'clrTit bld tuht_show'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'ctTp tuht_show'}):
+            t_description = description.text
+        for contents in t_soup.findAll('article', {'class': 'nwsHt nwsUpgrade'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "dantri.com.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title-page detail'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'singular-sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'singular-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "baophapluat.vn"):
+        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+        for contents in t_soup.findAll('td', {'class': 'caption'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "kenh14.vn"):
+        for title in t_soup.findAll('h1', {'class': 'kbwc-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'knc-sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'knc-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "laodong.vn"):
+        for title in t_soup.findAll('h1', {'class': 'title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'chappeau'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'art-body'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "qdnd.vn"):
+        for title in t_soup.findAll('h1', {'class': 'post-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('div', {'class': 'post-summary'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'post-content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "vtc.vn"):
+        for title in t_soup.findAll('h1', {'class': 'font28 bold lh-1-3'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'font18 bold inline-nb'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'class': 'edittor-content box-cont mt15 clearfix '}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+    if (site == "toquoc.vn"):
+        for title in t_soup.findAll('h1', {'class': 'entry-title'}):
+            t_title = title.text
+        for description in t_soup.findAll('h2', {'class': 'sapo'}):
+            t_description = description.text
+        for contents in t_soup.findAll('div', {'data-role': 'content'}):
+            for content in contents.findAll('p'):
+                t_contents += content.text + ". "
+
+
+
+
+    news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "",'date':""}
+    if t_title=="": return {}
+    return news
+
+#-----------------------
+sites={'vnexpress.net','thanhnien.vn','tienphong.vn',
+       'vov.vn','nhandan.vn','zingnews.vn',
+       'tuoitre.vn','cand.com.vn','vtv.vn',
+       '24h.com.vn','dantri.com.vn','baophapluat.vn',
+       'kenh14.vn','laodong.vn','qdnd.vn','vtc.vn',
+       'toquoc.vn'}
+
+fi=open("baomoi_testing_crawling.txt","r")
+fo=open("testing.txt",'w')
+#--------------------------
+
+clear_proxy_list(gl, driver)  # the function is defined with (gl, driver); the original call passed no arguments
+
+for line in fi.readlines():
+    a=json.loads(line)
+    t_str=ner(a["title"])
+    #---
+    t_date=a["date"]
+    year=int(t_date[0:4])
+    month=int(t_date[5:7])
+    day=int(t_date[8:10])
+    date=datetime(year,month,day)
+    #---
+    searching_key= ''
+    for words in t_str:
+        if (words[1]=="N") or (words[1]=="Np"):
+            searching_key+= '"' + words[0] + '"' + "%2B"
+    searching_key=searching_key.replace(" ", "+")
+    searching_key= searching_key[0:len(searching_key) - 3]
+    source = [line]
+    for site in sites:
+        #print(create_link(site))
+        check_link=create_link(site)
+        driver.get(check_link)
+        time.sleep(1)
+        #print(check_link)
+        if (driver.current_url.find("sorry")!=-1):
+            print("Error!")
+        else:
+            print(driver.current_url)
+        """
+            try:
+                i+=1
+                driver.close()
+                gl = GoLogin({
+                    "token": TOKEN,
+                    'profile_id': creat_new_profile_id(gl,i),
+                })
+                debugger_address = gl.start()
+                chrome_options = Options()
+                chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+                driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+
+                driver.get(check_link)
+            except:
+                pass
+
+        #print("#---------------------"+"\n")
+        '''
+        while (driver.current_url!=check_link):
+            driver.delete_all_cookies()
+            driver.refresh()
+            driver.get(check_link)
+        '''
+        """
+        #driver.execute_script("window.open("+"'"+create_link(site)+"'"+");")
+        for tmp in driver.find_elements(By.TAG_NAME, 'a'):
+            extracted_link = tmp.get_attribute("href")
+            if (extracted_link != None):
+                if (extracted_link.find("https://" + site + "/") == 0):
+                    #print(extracted_link)
+                    print(extracted_link)
+                    news = crawl(extracted_link,site)
+                    if news!={}:
+                        source.append(news)
+                    break
+        #time.sleep(2)
+        #crawling(create_link(site))
+        # print(source)
+
+    fo.write(json.dumps(source, ensure_ascii=False))
+    fo.write('\n')
+
+
+print("--- %s seconds ---" % (time.time() - start))
+
+
+
+
+
+
+
+

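Once clear_proxy_list() has filled proxy_check, the usable proxies can be filtered out of proxy_list. A small sketch (it assumes proxy_check ends up with one True/False entry per proxy):

# Keep only the proxies whose GoLogin profile started without an error.
working_proxies = [p for p, ok in zip(proxy_list, proxy_check) if ok]
print(len(working_proxies), "of", len(proxy_list), "proxies usable")
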
+ 90 - 0
Crawl_Baomoi/crawl_links.py

@@ -0,0 +1,90 @@
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+
+from bs4 import BeautifulSoup
+import bs4
+import requests
+from urllib.parse import urljoin
+import time
+
+base = 'https://baomoi.com/'
+
+#start=time.time()
+
+f=open("links2.txt","w")
+
+
+
+def scraping_link(link):
+    url = requests.get(link)
+    soup = BeautifulSoup(url.content, 'lxml')
+    for links in soup.findAll('div',{'class':'bm_O'}):
+        for a in links.findAll('a', href=True):
+            f.write(urljoin(base,a['href']))
+            f.write('\n')
+
+
+
+for i in range(1,168):
+    start = time.time()
+    print(i)
+    #scraping_link("https://baomoi.com/tin-moi/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/the-gioi/trang+"+str(i)+".epi")
+    #----
+    scraping_link("https://baomoi.com/thoi-su/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/giao-thong/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/moi-truong-khi-hau/trang+"+str(i)+".epi")
+    #----
+    scraping_link("https://baomoi.com/nghe-thuat/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/am-thuc/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/du-lich/trang+"+str(i)+".epi")
+    #----
+    scraping_link("https://baomoi.com/lao-dong-viec-lam/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/tai-chinh/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/chung-khoan/trang+"+str(i)+".epi")
+    scraping_link("https://baomoi.com/kinh-doanh/trang+"+str(i)+".epi")
+    #-----
+    scraping_link("https://baomoi.com/hoc-bong-du-hoc/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/dao-tao-thi-cu/trang+" + str(i) + ".epi")
+    #-----
+    scraping_link("https://baomoi.com/bong-da-quoc-te/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/bong-da-viet-nam/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/quan-vot/trang+" + str(i) + ".epi")
+    #---
+    scraping_link("https://baomoi.com/am-nhac/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/thoi-trang/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/dien-anh-truyen-hinh/trang+" + str(i) + ".epi")
+    #---
+    scraping_link("https://baomoi.com/an-ninh-trat-tu/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/hinh-su-dan-su/trang+" + str(i) + ".epi")
+    # ---
+    scraping_link("https://baomoi.com/cntt-vien-thong/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/thiet-bi-phan-cung/trang+" + str(i) + ".epi")
+    # ---
+    scraping_link("https://baomoi.com/khoa-hoc/trang+" + str(i) + ".epi")
+    # ---
+    scraping_link("https://baomoi.com/dinh-duong-lam-dep/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/tinh-yeu-hon-nhan/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/suc-khoe-y-te/trang+" + str(i) + ".epi")
+    # ---
+    scraping_link("https://baomoi.com/xe-co/trang+" + str(i) + ".epi")
+    # ---
+    scraping_link("https://baomoi.com/quan-ly-quy-hoach/trang+" + str(i) + ".epi")
+    scraping_link("https://baomoi.com/khong-gian-kien-truc/trang+" + str(i) + ".epi")
+    print("--- %s seconds ---" % (time.time() - start))
+f.close()
+
+
+
+
+
+
+
+
+
+
+
+

+ 35 - 0
Crawl_Baomoi/extract_data.py

@@ -0,0 +1,35 @@
+from underthesea import ner
+from underthesea import pos_tag
+from underthesea import word_tokenize
+import json
+from bs4 import BeautifulSoup
+import requests
+
+
+fi=open("baomoi_testing_crawling.txt","r")
+
+i=0
+
+for line in fi.readlines():
+    a=json.loads(line)
+    t_str=ner(a["title"])
+    #---
+    t_date=a["date"]
+    year=t_date[0:4]
+    month=t_date[5:7]
+    day=t_date[8:10]
+    #---
+    print(t_str)
+    searching_key= ''
+    for words in t_str:
+        if (words[1]=="N") or (words[1]=="Np"):
+            searching_key+= '"' + words[0] + '"' + "%2B"
+    searching_key=searching_key.replace(" ", "+")
+    searching_key= searching_key[0:len(searching_key) - 3]
+    '''
+    print(searching_key)
+    print(year,' ',month,' ',day)
+    print(i)
+    '''
+    i+=1
+    if (i==2): break

+ 300 - 0
Crawl_Baomoi/freeproxylist.txt

@@ -0,0 +1,300 @@
+23.238.33.186:80
+47.254.47.61:8080
+117.54.114.98:80
+80.48.119.28:8080
+117.103.163.12:8080
+116.58.166.194:8080
+118.26.110.48:8080
+138.68.235.51:80
+198.49.68.80:80
+169.57.1.85:8123
+51.15.242.202:8888
+141.95.122.232:80
+82.210.8.173:80
+146.196.48.2:80
+219.78.228.211:80
+83.229.73.175:80
+37.53.103.4:3128
+38.242.204.153:7070
+200.69.210.59:80
+155.133.71.16:8080
+37.112.29.73:55443
+85.10.199.48:80
+47.91.56.120:8080
+23.137.139.61:3129
+47.252.1.180:3128
+161.97.126.37:8118
+20.111.54.16:80
+124.13.181.4:80
+3.1.248.232:80
+103.234.200.254:80
+165.154.225.65:80
+129.154.54.57:3128
+45.79.94.19:80
+111.251.185.158:80
+111.250.22.44:80
+103.172.116.231:80
+172.105.107.25:999
+155.138.197.162:80
+200.103.102.18:80
+45.229.34.174:999
+216.137.184.253:80
+20.210.113.32:8123
+104.248.194.178:80
+116.203.199.47:8080
+165.232.149.87:8888
+201.91.18.82:8000
+185.15.172.212:3128
+143.198.77.180:80
+177.82.85.209:3128
+47.241.165.133:443
+118.70.186.173:4007
+181.49.100.190:8080
+185.237.99.218:61443
+104.225.220.233:80
+51.103.137.65:80
+54.66.104.168:80
+167.99.236.14:80
+178.128.122.245:80
+197.243.20.178:80
+194.195.240.60:8080
+149.129.213.200:8080
+97.74.92.60:80
+80.66.81.40:8080
+103.155.196.22:8181
+182.253.235.63:8080
+128.199.202.122:8080
+110.238.111.229:8080
+107.172.73.179:7890
+20.206.106.192:8123
+72.169.67.85:87
+195.31.137.5:80
+66.175.223.147:4153
+198.11.175.192:8080
+206.189.146.13:8080
+139.99.237.62:80
+45.33.12.251:8080
+192.53.163.144:3128
+110.34.3.229:3128
+47.74.152.29:8888
+83.229.72.174:80
+37.120.192.154:8080
+173.255.209.155:1080
+105.16.115.202:80
+110.164.3.7:8888
+18.207.107.60:80
+198.11.175.180:8080
+20.24.43.214:8123
+52.88.105.39:80
+167.71.230.124:8080
+193.122.71.184:3128
+64.227.23.88:8118
+154.239.1.77:1981
+187.217.54.84:80
+165.154.226.12:80
+54.36.239.180:5000
+151.181.91.10:80
+121.181.111.191:8001
+49.207.36.81:80
+172.105.231.110:80
+193.141.65.48:808
+213.230.66.38:8080
+139.59.88.145:8888
+104.148.36.10:80
+79.111.13.155:50625
+103.166.28.12:8181
+72.169.67.61:87
+103.17.246.148:8080
+93.100.118.135:8080
+87.247.187.9:3128
+93.180.135.243:3128
+195.138.90.226:3128
+157.100.56.181:999
+177.53.153.14:999
+36.94.183.153:8080
+41.254.53.70:1976
+103.177.20.148:8181
+43.154.216.109:80
+159.65.63.209:8888
+158.69.71.245:9300
+158.69.53.98:9300
+176.58.112.123:1080
+172.105.184.208:8001
+103.152.112.145:80
+89.107.197.165:3128
+47.91.44.217:8000
+154.236.184.71:1974
+117.54.114.35:80
+198.59.191.234:8080
+180.232.123.251:3128
+178.79.138.253:8080
+3.212.9.208:80
+5.254.34.4:3129
+82.223.102.92:9443
+50.233.228.147:8080
+213.214.74.90:80
+72.55.155.80:80
+44.204.198.120:80
+8.209.68.1:8080
+121.181.40.87:8001
+194.35.127.130:80
+68.183.242.248:3128
+157.245.207.186:8080
+52.200.191.158:80
+20.187.77.5:80
+43.154.233.149:8080
+74.205.128.200:80
+35.86.232.240:80
+201.229.250.22:8080
+167.99.174.59:80
+14.139.242.7:80
+1.224.3.122:3888
+162.144.233.16:80
+203.198.207.253:80
+41.32.12.190:80
+130.185.122.169:80
+157.100.26.69:80
+143.198.182.218:80
+51.83.98.90:80
+188.235.0.207:8282
+31.220.183.217:53281
+41.188.149.79:80
+8.209.64.208:8080
+92.119.59.241:80
+112.120.41.171:80
+65.108.145.78:8080
+149.129.254.140:80
+103.168.129.123:8080
+202.180.20.66:8080
+103.154.230.129:8080
+195.133.49.95:3128
+103.17.213.98:8080
+45.233.67.226:999
+138.117.110.87:999
+45.56.83.46:8012
+195.201.30.206:5566
+103.141.52.218:8000
+100.20.122.18:80
+3.138.46.196:80
+117.54.114.99:80
+20.81.62.32:3128
+149.129.184.250:8080
+159.138.169.48:8080
+95.216.88.150:3128
+180.94.69.66:8080
+117.54.114.102:80
+115.243.88.49:80
+193.233.210.84:8085
+73.32.226.94:8118
+23.95.49.244:3128
+193.233.229.112:8085
+193.168.182.98:8800
+85.208.210.66:8085
+23.94.238.204:3128
+149.18.30.166:8085
+138.2.64.185:8118
+173.212.195.139:80
+51.159.207.156:3128
+135.125.1.230:80
+51.250.80.131:80
+8.209.246.6:80
+58.27.59.249:80
+104.45.128.122:80
+138.91.159.185:80
+209.97.152.208:8888
+34.81.72.31:80
+34.223.105.122:80
+100.20.101.185:80
+68.183.143.134:80
+43.255.113.232:8082
+65.21.131.27:80
+85.133.229.10:8080
+173.249.25.220:80
+198.13.54.14:80
+134.209.90.106:80
+143.198.40.24:8888
+157.230.97.17:8888
+201.217.49.2:80
+174.138.16.96:8888
+205.207.103.97:8282
+189.82.62.163:8080
+185.130.80.81:8080
+138.117.231.130:999
+200.71.109.237:999
+124.105.75.122:8080
+117.240.28.81:8080
+138.59.187.10:666
+135.181.29.13:3128
+45.149.41.237:41890
+194.1.250.56:8080
+102.68.128.212:8080
+87.246.54.221:8888
+51.91.56.181:3128
+139.59.228.95:8118
+47.56.69.11:8000
+162.0.226.218:80
+45.79.208.64:44554
+103.103.52.40:44116
+43.205.33.122:80
+149.129.239.170:8080
+110.238.109.146:8080
+222.252.156.61:62694
+31.200.229.104:56471
+178.63.133.25:80
+116.0.61.122:3128
+93.191.96.4:3128
+201.238.248.139:9229
+186.193.246.32:8080
+202.152.51.44:8080
+187.251.107.143:8080
+103.142.241.142:3127
+103.121.120.69:8080
+45.235.46.94:8080
+78.83.199.235:53281
+88.218.17.112:3129
+119.18.152.210:3127
+103.146.30.178:8080
+173.196.205.170:8080
+103.144.18.67:8082
+181.143.224.43:999
+188.132.222.4:8080
+202.77.115.69:8182
+102.222.146.203:8080
+114.79.146.137:8080
+212.108.144.67:8080
+103.122.64.234:3125
+103.83.179.150:8080
+110.164.162.42:8080
+206.62.137.57:8080
+45.79.253.142:3128
+100.20.156.53:80
+3.226.168.144:80
+139.59.61.115:80
+103.106.193.137:7532
+162.243.174.235:80
+159.138.252.45:8080
+207.180.250.238:80
+103.171.182.230:8080
+134.122.58.174:80
+81.94.255.12:8080
+121.139.218.165:31409
+92.205.22.114:38080
+47.254.237.222:8080
+13.81.217.201:80
+54.246.207.78:80
+52.47.137.181:80
+200.105.215.18:33630
+47.245.34.161:8080
+202.180.20.11:55443
+47.253.105.175:5566
+121.151.97.238:8001
+117.54.114.97:80
+193.56.118.205:443
+45.79.90.143:44554
+35.84.133.18:80
+54.175.197.235:80
+213.230.97.98:3128
+110.238.74.184:8080
+139.59.43.194:3128
+178.115.253.35:8080
+41.76.221.33:8088

+ 42 - 0
Crawl_Baomoi/get_random_free_proxy.py

@@ -0,0 +1,42 @@
+import requests
+import random
+from bs4 import BeautifulSoup as bs
+import traceback
+import time
+
+def get_free_proxies():
+    url = "https://free-proxy-list.net/"
+    # request and grab content
+    soup = bs(requests.get(url).content, 'html.parser')
+    # to store proxies
+    proxies = []
+    for row in soup.find("table", attrs={"class": "table-striped"}).find_all("tr")[1:]:
+        tds = row.find_all("td")
+        try:
+            ip = tds[0].text.strip()
+            port = tds[1].text.strip()
+            proxies.append(str(ip) + ":" + str(port))
+        except IndexError:
+            continue
+    return proxies
+
+url = "http://ipinfo.io/json"
+proxies = get_free_proxies()
+print(proxies)
+
+def first_proxy(proxies):
+    while True:
+        proxy = proxies[random.randint(0,len(proxies)-1)]
+        try:
+            response = requests.get(url, proxies = {"http":"http://"+str(proxy), "https":"https://"+str(proxy)}, timeout=1)
+            print(response.json()['country'])
+            print(response.json()['region'])
+            print(response.text)
+            break
+        except:
+            pass
+            # if the proxy Ip is preoccupied
+            # print("Not Available")
+    return proxy
+
+print(first_proxy(proxies))

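A proxy returned by first_proxy() can be plugged straight into the requests-based crawling used elsewhere in this commit. A minimal sketch (the target URL is only an example):

import requests

proxy = first_proxy(get_free_proxies())  # pick a free proxy that answers within the timeout
proxies = {"http": "http://" + proxy, "https": "http://" + proxy}

# Example fetch through the chosen proxy; any article URL from links2.txt would work here.
resp = requests.get("https://baomoi.com/", proxies=proxies, timeout=10)
print(resp.status_code)
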
BIN
Crawl_Baomoi/gologin_zeroprofile.zip


File diff suppressed because it is too large
+ 325676 - 0
Crawl_Baomoi/links2.txt


File diff suppressed because it is too large
+ 325676 - 0
Crawl_Baomoi/links2_tmp.txt


+ 47 - 0
Crawl_Baomoi/proxylist.txt

@@ -0,0 +1,47 @@
+45.91.93.166:22413
+92.53.90.84:5031
+176.126.84.126:16432
+195.154.43.86:52110
+162.19.7.58:61711
+195.154.43.198:50288
+134.195.91.76:22613
+89.248.165.79:5478
+45.132.75.19:25543
+66.29.128.246:59446
+162.19.7.48:49901
+66.29.128.244:27264
+37.221.193.221:25830
+37.221.193.221:30479
+31.220.43.141:12823
+185.209.30.138:4035
+176.126.84.126:35711
+176.126.84.126:12305
+209.159.153.21:49169
+37.221.193.221:12833
+45.132.75.19:34380
+37.221.193.221:15098
+185.209.30.138:4064
+162.0.220.220:30537
+37.221.193.221:24987
+45.91.92.30:24636
+31.220.43.141:27203
+162.19.7.46:35983
+176.126.84.126:25188
+37.221.193.221:26486
+45.132.75.19:14733
+66.29.129.52:24365
+51.254.149.59:49884
+38.91.107.229:17261
+51.83.116.5:11820
+31.220.43.141:17452
+31.220.43.141:23578
+31.220.43.141:27172
+162.19.7.48:24982
+176.126.84.126:30183
+31.220.43.141:27357
+37.221.193.221:22340
+162.19.7.58:36635
+45.132.75.19:30709
+134.195.91.76:29261
+174.138.176.75:33861
+89.248.165.79:5480

File diff suppressed because it is too large
+ 2 - 0
Crawl_Baomoi/testing.txt


+ 105 - 0
Crawl_Baomoi/testing_scrape.py

@@ -0,0 +1,105 @@
+from bs4 import BeautifulSoup
+import requests
+
+from sys import platform
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from pygologin.gologin import GoLogin
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+
+from underthesea import ner
+from bs4 import BeautifulSoup
+import requests
+
+from googlesearch import *
+
+from datetime import datetime
+from datetime import timedelta
+"""
+TOKEN="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzNkZDJlOWYwMzIwMjBkYWQwNDU2ZTciLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzNkZDM0YWM5OWFmMmMzMzdkMjNmNGQifQ.7UmxqoGmN25EwG1DmN-2aJZqbBUY3R4hgKJciKgUwRg"
+
+link="https://ipinfo.io/"
+
+gl = GoLogin({
+	"token": TOKEN,
+    'tmpdir':"/tmp/",
+    "local":True,
+    "credentials_enable_service": False,
+})
+
+profile_id = gl.create({
+    "name": 'profile_1',
+    "os": 'mac',
+    "proxyEnabled": True,
+    "navigator": {
+        "language": 'en-US,en;q=0.9,he;q=0.8',
+        "userAgent": 'MyUserAgent',
+        "resolution": '1024x768',
+        "platform": 'darwin',
+    },
+    "proxy":{
+        'mode': 'http',
+        'host': "139.99.237.62",
+        'port': "80",
+        'username': "",
+        'password': "",
+    }
+});
+'host': "139.99.237.62",
+        'port': ,
+        'username': "",
+        'password': "",
+
+
+
+gl = GoLogin({
+	"token": TOKEN,
+    'profile_id':profile_id,
+})
+
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+
+driver.get(link)
+gl.delete(profile_id)
+driver.close()
+print("end session!")
+# ----------------------------
+
+
+
+"""
+link="https://toquoc.vn/van-hoa-khong-co-su-cao-thap-nho-hay-lon-ma-chi-co-su-da-dang-net-dac-sac-tieu-bieu-can-duoc-ton-trong-ton-vinh-phat-huy-giu-gin-20221006225030042.htm"
+news = {}
+t_title = ""
+t_description = ""
+t_contents = ''
+url = requests.get(link)
+t_soup = BeautifulSoup(url.text, 'lxml')
+
+
+for title in t_soup.findAll('h1', {'class': 'entry-title'}):
+    t_title = title.text
+for description in t_soup.findAll('h2', {'class': 'sapo'}):
+    t_description = description.text
+for contents in t_soup.findAll('div', {'data-role': 'content'}):
+    for content in contents.findAll('p'):
+        t_contents += content.text + ". "
+
+
+
+news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "",'date':""}
+print(news)

+ 100 - 0
Crawl_Baomoi/testing_tmp.py

@@ -0,0 +1,100 @@
+from sys import platform
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+import sys,os
+sys.path.append('../pygologin')
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from pygologin.gologin import GoLogin
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+
+from underthesea import ner
+from bs4 import BeautifulSoup
+import requests
+
+
+from datetime import datetime
+from datetime import timedelta
+
+
+TOKEN="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzZiNjdiNTVhMTI5NzNmY2FiMzdlMTAiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzZiNjgwNDczY2QwZDFiNjNmYmM5YTIifQ.VfF22lLEMP3JSklvuWTgOfkxEKKHCcsSYQotg6zMcac"
+gl = GoLogin({
+    "token": TOKEN,
+    'profile_id': "636b67b55a12973346b37e12",
+})
+
+
+proxy_list=[]
+for line in open("proxylist.txt","r"):
+    line=line[:-1]
+    proxy_list.append(line.split(":"))
+proxy_check=[]
+
+
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+driver.close()
+
+def creat_new_profile_id(gl, i):
+    host=proxy_list[i][0]
+    port=proxy_list[i][1]
+    profile_id = gl.create({
+        "name": 'profile_'+str(i),
+        "os": 'mac',
+        "proxyEnabled": True,
+        "navigator": {
+            "language": 'en-US,en;q=0.9,he;q=0.8',
+            "userAgent": 'MyUserAgent',
+            "resolution": '1024x768',
+            "platform": 'darwin',
+        },
+        "proxy":{
+            'mode': 'socks5',
+            'host': host,
+            'port': port,
+            'username': "prateep6793",
+            'password': "Zing1234",
+        }
+    });
+    return profile_id
+
+def clear_proxy_list(gl,driver):
+    for i in range(len(proxy_list)):  # cover every proxy; the original -1 skipped the last entry
+        print(i)
+        try:
+            profile_id=creat_new_profile_id(gl,i)
+            gl = GoLogin({
+                "token": TOKEN,
+                'profile_id': profile_id,
+            })
+            chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
+            debugger_address = gl.start()
+            chrome_options = Options()
+            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+            driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
+            proxy_check.append(True)
+            driver.get("https://www.google.com/")
+            print("Okay")
+        except:
+            print("Error Proxy!")
+            proxy_check.append(False)
+        time.sleep(2)
+        driver.refresh()
+        driver.close()
+        gl.delete(profile_id)
+
+
+clear_proxy_list(gl,driver)
+print(proxy_list)
+

+ 33 - 0
Crawl_Baomoi/testingscrape2.py

@@ -0,0 +1,33 @@
+import requests
+import random
+
+import requests
+from requests.auth import HTTPProxyAuth
+from bs4 import BeautifulSoup as bs
+import traceback
+#from proxybroker import checker
+
+url = "http://ipinfo.io/json"
+proxy_list=[]
+for line in open("proxylist.txt","r"):
+    line=line[:-1]
+    proxy_list.append(line)
+
+print(proxy_list)
+#checker.ProxyChecker()
+
+auth=HTTPProxyAuth("prateep6793","Zing1234")
+
+for i in range(len(proxy_list)):
+
+    #printing req number
+    print("Request Number : " + str(i+1))
+    proxy = proxy_list[i]
+    #print(proxy)
+    try:
+        response = requests.get(url, proxies = {"http":"socks5://"+proxy, "https":"socks5://"+proxy},auth=auth, timeout=10)
+        print("ok!")
+    except:
+        # if the proxy Ip is pre occupied
+        print("Not Available")
+

+ 1 - 0
README.md

@@ -0,0 +1 @@
+# ndtai

+ 39 - 0
RSStest.py

@@ -0,0 +1,39 @@
+from bs4 import BeautifulSoup
+import requests
+import time
+start=time.time()
+
+def crawl():
+    f = open("testingrss.txt", "w")
+
+    url = requests.get('https://vnexpress.net/rss/the-thao.rss')
+
+    soup = BeautifulSoup(url.content, 'xml')
+    items = soup.find_all('item')
+
+    for item in items:
+        title = item.title.text
+        print(title + '\n')
+    # -------------------------------------------
+    url = requests.get('https://vnexpress.net/rss/thoi-su.rss')
+
+    soup = BeautifulSoup(url.content, 'xml')
+    items = soup.find_all('item')
+
+    for item in items:
+        title = item.title.text
+        print(title + '\n')
+    # -------------------------------------------
+    url = requests.get('https://vnexpress.net/rss/giao-duc.rss')
+
+    soup = BeautifulSoup(url.content, 'xml')
+    items = soup.find_all('item')
+
+    for item in items:
+        title = item.title.text
+        print(title + '\n')
+    # ------------------
+    f.close()
+
+crawl()
+print("--- %s seconds ---" % (time.time() - start))

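The three blocks inside crawl() differ only in the feed URL, and the file handle f is opened but never written to. An equivalent, behavior-preserving sketch that loops over the feeds and drops the unused handle:

from bs4 import BeautifulSoup
import requests

FEEDS = [
    "https://vnexpress.net/rss/the-thao.rss",
    "https://vnexpress.net/rss/thoi-su.rss",
    "https://vnexpress.net/rss/giao-duc.rss",
]

def crawl():
    # Print the title of every item in each VnExpress RSS feed.
    for feed in FEEDS:
        soup = BeautifulSoup(requests.get(feed).content, "xml")
        for item in soup.find_all("item"):
            print(item.title.text + "\n")

crawl()
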
+ 78 - 0
Suggest_news.py

@@ -0,0 +1,78 @@
+import time
+import re
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+
+from bs4 import BeautifulSoup
+from bs4 import element
+
+import bs4
+import requests
+
+start=time.time()
+
+
+
+news={}
+#news[0]={'title':'','link':'','content':''}
+
+
+def scraping_soup(link, category, page):
+    url = requests.get(link)
+    if (page=="vnexpress"):
+        soup = BeautifulSoup(url.content, 'lxml')
+    else:
+        soup = BeautifulSoup(url.content, 'html.parser')
+    items = soup.findAll('item')
+    i=0
+    for item in items:
+
+        title = item.title.text
+        link = item.guid.text
+        description = item.description.text
+        print(title)
+        #--------
+
+
+        news[len(news)] = {'title': title, 'link': link, 'content': title + ' ' + description, 'category': category, 'page':page}
+        i+=1
+        if i==30: break
+
+
+#def Suggest_news_thethao():
+    #scraping_soup('https://vnexpress.net/rss/the-thao.rss','thethao', 'vnexpress')
+
+
+def Suggest_news_thoisu_chinhtri():
+    scraping_soup('https://vnexpress.net/rss/thoi-su.rss', 'thoisu', 'vnexpress')
+    #scraping_soup('https://vtv.vn/trong-nuoc/chinh-tri.rss', 'thoisu')
+    scraping_soup('https://toquoc.vn/rss/thoi-su-1.rss', 'thoisu','toquoc')
+    #scraping_soup('https://baotintuc.vn/thoi-su.rss', 'thoisu', 'baotintuc')
+    #scraping_soup('https://vietnamnet.vn/rss/thoi-su.rss', 'thoisu', 'vietnamnet')
+   # scraping_soup('https://laodong.vn/rss/thoi-su.rss', 'thoisu', 'laodong')
+
+
+#def Suggest_news_vanhoa():
+    #scraping_soup('https://toquoc.vn/rss/van-hoa-10.rss', 'vanhoa', 'toquoc')
+    #scraping_soup('https://baotintuc.vn/van-hoa.rss', 'vanhoa', 'baotintuc')
+    #scraping_soup('https://laodong.vn/rss/van-hoa-giai-tri.rss', 'vanhoa', 'laodong')
+
+
+
+#Suggest_news_thethao()
+Suggest_news_thoisu_chinhtri()
+#Suggest_news_vanhoa()
+
+print(news)
+
+
+
+
+
+
+
+
+
+

BIN
__pycache__/Suggest_news.cpython-310.pyc


BIN
__pycache__/creatdata.cpython-310.pyc


BIN
__pycache__/main.cpython-310.pyc


File diff suppressed because it is too large
+ 42377 - 0
baothethao.txt


File diff suppressed because it is too large
+ 685 - 0
baothethao2.txt


File diff suppressed because it is too large
+ 1 - 0
count_dict.txt


+ 55 - 0
crawldatanews3.py

@@ -0,0 +1,55 @@
+import sys
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+
+from bs4 import BeautifulSoup
+import requests
+import time
+
+start=time.time()
+
+fi=open("linksthethao.txt","r")
+fo=open("baothethao2.txt","w")
+
+i=0
+
+for line in fi.readlines():
+    link=line.strip()
+    url=requests.get(link)
+
+    t_soup = BeautifulSoup(url.text, 'lxml')
+
+    for headline in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
+        fo.write(headline.text)
+        fo.write('\n')
+    for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
+        fo.write(description.text)
+        fo.write('\n')
+    str = ''
+    for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
+        for content in contents.findAll('p'):
+            fo.write(content.text)
+            fo.write('\n')
+
+    i+=1
+    print(i)
+    if (i==50): break
+
+
+fi.close()
+fo.close()
+print("--- %s seconds ---" % (time.time() - start))
+
+
+
+
+
+
+
+
+
+
+

+ 14 - 0
creatdata.py

@@ -0,0 +1,14 @@
+import json
+
+twowords=set()
+stopwords=set()
+
+for line in open("vietdict.txt", 'r'):
+    a=json.loads(line)
+    t_str=a["text"]
+    tmp=t_str.split()
+    if (len(tmp)==2):
+        twowords.add(t_str)
+
+for line in open("vietnamese-stopwords.txt", 'r'):
+    stopwords.add(line[:-1])

BIN
gologin_zeroprofile.zip


+ 58 - 0
gui.py

@@ -0,0 +1,58 @@
+from tkinter import *
+import tkinter as tk
+from tkinter import ttk
+import webbrowser
+from main import res_cos
+from main import link_title
+
+# MacOS
+chrome_path = 'open -a /Applications/Google\ Chrome.app %s'
+
+
+
+root = tk.Tk()
+root.geometry("400x400")
+root.title("News Widget")
+tabControl = ttk.Notebook(root)
+
+tab1 = ttk.Frame(tabControl)
+tab2 = ttk.Frame(tabControl)
+
+
+
+def weblink(*args):
+    index = lbx1.curselection()[0]
+    item = lbx1.get(index)
+    webbrowser.open_new(link_title[item])
+    lbx1.delete(index)
+    lbx2.insert(lbx2.size(),item)
+
+tabControl.add(tab1, text='News')
+tabControl.add(tab2, text='History')
+tabControl.pack(expand=1, fill="both")
+lbx1 = tk.Listbox(tab1,width=40,height=40)
+lbx2 = tk.Listbox(tab2,width=40,height=40)
+
+lbx1.bind('<<ListboxSelect>>', weblink)
+for key in res_cos:
+    lbx1.insert(END, key)
+
+
+
+
+
+
+lbx1.pack()
+lbx2.pack()
+
+root.mainloop()
+"""
+lbx=Listbox(root,width=40,height=40)
+lbx.pack(pady=15)
+
+#AddItems
+for i in range(30):
+    lbx.insert(i,"this is a test "+str(i))
+
+
+"""

+ 70 - 0
linksthethao.txt

@@ -0,0 +1,70 @@
+https://thanhnien.vn/cac-nuoc-chuc-mung-quoc-khanh-viet-nam-post1495932.html
+https://thanhnien.vn/tong-bi-thu-nguyen-phu-trong-dang-huong-tuong-niem-chu-tich-ho-chi-minh-post1495512.html
+https://thanhnien.vn/lanh-dao-dang-nha-nuoc-vieng-chu-tich-ho-chi-minh-nhan-quoc-khanh-29-post1495472.html
+https://thanhnien.vn/tphcm-phat-huy-vun-dap-moi-quan-he-huu-nghi-va-hop-tac-viet-nam---lao-post1493485.html
+https://thanhnien.vn/thu-tuong-pham-minh-chinh-cac-ton-giao-luon-cung-dat-nuoc-vuot-qua-kho-khan-thach-thuc-post1493404.html
+https://thanhnien.vn/can-giai-phap-de-doan-vien-thanh-nien-tham-gia-xay-dung-dang-post1494969.html
+https://thanhnien.vn/tong-thong-belarus-quan-doi-ukraine-se-can-thiep-chinh-tri-de-xung-dot-som-ket-thuc-post1495942.html
+https://thanhnien.vn/pho-sa-te-cao-van-lau-40-nam-hut-khach-nho-nuoc-leo-dau-phong-doc-dao-post1495770.html
+https://thanhnien.vn/lay-ly-do-ro-ri-nga-dong-duong-ong-nord-stream-1-vo-thoi-han-post1496002.html
+https://thanhnien.vn/tau-san-bay-noi-dia-dau-tien-cua-an-do-co-ti-le-noi-dia-hoa-bao-nhieu-post1495970.html
+https://thanhnien.vn/antony-ra-mat-o-tran-manchester-united-arsenal-co-qua-voi-vang-post1496012.html
+https://thanhnien.vn/csgt-tphcm-truy-duoi-khong-che-nghi-pham-cuop-xe-may-post1495965.html
+https://thanhnien.vn/ninh-binh-mot-nguoi-bi-dien-giat-tu-vong-khi-dang-sua-duong-dien-post1496049.html
+https://thanhnien.vn/5-nhom-trieu-chung-can-can-thiep-y-te-khan-cap-sau-mac-covid-19-post1496048.html
+https://thanhnien.vn/quang-binh-200-canh-sat-vay-bat-hang-chuc-nguoi-bay-lac-o-karaoke-thien-duong-2-post1496043.html
+https://thanhnien.vn/tphcm-trom-chim-bi-truy-duoi-lien-xit-hoi-cay-chong-tra-post1496070.html
+https://thanhnien.vn/kon-tum-xac-minh-thiet-hai-cua-nguoi-trong-sam-ngoc-linh-de-xem-xet-khoanh-no-post1495955.html
+https://thanhnien.vn/vu-khai-thac-lau-15-trieu-tan-quang-van-ban-trai-luat-cua-ubnd-tinh-lao-cai-post1496019.html
+https://thanhnien.vn/dong-nai-dien-tich-toi-thieu-dat-nong-nghiep-o-nong-thon-sau-tach-thua-la-2000-m2-post1496037.html
+https://thanhnien.vn/pv-thanh-nien-ho-tro-giai-cuu-2-nguoi-bi-lua-ban-sang-campuchia-post1495931.html
+https://thanhnien.vn/pho-bien-phan-thiet-dang-khoac-chiec-ao-chat-post1495882.html
+https://thanhnien.vn/vi-sao-giam-doc-so-tn-mt-thanh-hoa-mai-nhu-thang-bat-ngo-xin-chuyen-cong-tac-post1489705.html
+https://thanhnien.vn/phat-huy-suc-manh-cua-nhan-dan-trong-su-nghiep-bao-ve-an-ninh-quoc-gia-post1489659.html
+https://thanhnien.vn/ky-luat-nhieu-can-bo-lien-quan-dai-an-binh-duong-post1489663.html
+https://thanhnien.vn/chu-tich-nuoc-chu-tri-hoi-thao-ve-bao-ve-to-quoc-trong-tinh-hinh-moi-post1488764.html
+https://thanhnien.vn/quang-ninhpho-thu-tuong-pham-binh-minh-du-ngay-hoi-toan-dan-bao-ve-an-ninh-to-quoc-post1488511.html
+https://thanhnien.vn/phong-trao-bao-ve-an-ninh-to-quoc-that-su-la-noi-binh-yen-dang-song-post1488466.html
+https://thanhnien.vn/giu-gin-lau-dai-bao-ve-tuyet-doi-an-toan-thi-hai-chu-tich-ho-chi-minh-post1488249.html
+https://thanhnien.vn/bo-nhiem-vien-truong-vien-ksnd-tinh-dong-nai-post1487484.html
+https://thanhnien.vn/ngay-hoi-toan-dan-bao-ve-an-ninh-to-quoc-phong-phu-thiet-thuc-post1487368.html
+https://thanhnien.vn/day-du-co-so-de-xay-dung-nha-nuoc-phap-quyen-xhcn-post1487367.html
+https://thanhnien.vn/dong-thap-dieu-dong-2-bi-thu-huyen-ve-lam-pho-ban-to-chuc-va-tuyen-giao-post1487296.html
+https://thanhnien.vn/bo-cong-an-to-chuc-ky-niem-38-nam-thang-loi-ke-hoach-phan-gian-cm12-post1487287.html
+https://thanhnien.vn/nghien-cuu-nhan-rong-mo-hinh-cong-nhan-moi-truong-chien-si-tuan-tra-post1486150.html
+https://thanhnien.vn/nguyen-chu-tich-nuoc-truong-tan-sang-nhan-huy-hieu-50-nam-tuoi-dang-post1486112.html
+https://thanhnien.vn/ky-niem-110-nam-ngay-sinh-chu-tich-hoi-dong-nha-nuoc-vo-chi-cong-post1488689.html
+https://thanhnien.vn/chu-tich-nuoc-nguyen-xuan-phuc-dang-huong-khu-luu-niem-nha-cach-mang-vo-chi-cong-post1485634.html
+https://thanhnien.vn/pho-chu-tich-quoc-hoi-tran-thanh-man-du-khai-giang-nam-hoc-moi-tai-kien-giang-post1495993.html
+https://thanhnien.vn/cach-day-con-dung-dan-nhan-cho-con-post1495888.html
+https://thanhnien.vn/boc-tham-may-rui-vao-truong-mam-non-dau-tu-xay-nha-dung-bo-quen-truong-hoc-post1495938.html
+https://thanhnien.vn/thu-tuong-khong-de-hoc-sinh-nao-khong-duoc-toi-truong-khong-co-sach-giao-khoa-post1496045.html
+https://thanhnien.vn/dong-nai-dien-tich-toi-thieu-dat-nong-nghiep-o-nong-thon-sau-tach-thua-la-2000-m2-post1496037.html
+https://thanhnien.vn/5-nhom-trieu-chung-can-can-thiep-y-te-khan-cap-sau-mac-covid-19-post1496048.html
+https://thanhnien.vn/cu-tri-ba-ria---vung-tau-than-kho-mua-ve-bay-con-dao-bo-gtvt-noi-gi-post1496087.html
+https://thanhnien.vn/khu-nhac-nuoc-lon-nhat-dong-nam-a-mo-cua-don-khach-post1496024.html
+https://thanhnien.vn/nhieu-cua-hang-o-ca-mau-het-xang-dau-post1496010.html
+https://thanhnien.vn/he-lo-nhung-thu-ong-trump-quan-tam-khi-con-la-tong-thong-my-post1496060.html
+https://thanhnien.vn/tham-quyen-phong-toa-phan-lon-thanh-pho-tien-hanh-xet-nghiem-covid-19-post1496093.html
+https://thanhnien.vn/nong-phi-cong-lai-may-bay-luon-vong-doa-lao-vao-sieu-thi-walmart-o-my-post1496095.html
+https://thanhnien.vn/boc-tham-vao-truong-mam-non-loi-khong-chi-thuoc-ve-nganh-giao-duc-post1495478.html
+https://thanhnien.vn/nhung-nhom-nho-thien-nguyen-post1490701.html
+https://thanhnien.vn/trut-gian-loa-phuong-post1482932.html
+https://thanhnien.vn/vu-khai-thac-lau-15-trieu-tan-quang-van-ban-trai-luat-cua-ubnd-tinh-lao-cai-post1496019.html
+https://thanhnien.vn/tphcm-di-bo-sang-duong-nguoi-phu-nu-bi-cuop-giat-dien-thoai-vi-tien-post1495874.html
+https://thanhnien.vn/thua-thien---hue-nu-sinh-lop-8-bi-ban-danh-sau-buoi-hoc-noi-quy-post1495850.html
+https://thanhnien.vn/dong-nai-dien-tich-toi-thieu-dat-nong-nghiep-o-nong-thon-sau-tach-thua-la-2000-m2-post1496037.html
+https://thanhnien.vn/tu-vu-cho-pitbull-tan-cong-nguoi-phu-nu-tu-vong-nen-cam-nuoi-cho-pitbull-post1496011.html
+https://thanhnien.vn/nguoi-dan-o-ha-tinh-thiet-hai-nang-do-ngao-nuoi-chet-hang-loat-post1496031.html
+https://thanhnien.vn/dong-nai-giai-ngan-311-ti-dong-ho-tro-tien-thue-tro-cho-cong-nhan-post1493145.html
+https://thanhnien.vn/binh-phuoc-da-co-1011-dia-phuong-chi-tra-tien-thue-nha-cho-nguoi-lao-dong-post1493378.html
+https://thanhnien.vn/tphcm-1000-cong-nhan-cong-ty-nidec-servo-ngung-viec-kien-nghi-tang-luong-post1491722.html
+https://thanhnien.vn/dong-nai-dien-tich-toi-thieu-dat-nong-nghiep-o-nong-thon-sau-tach-thua-la-2000-m2-post1496037.html
+https://thanhnien.vn/livestream-o-phien-toa-van-co-the-bi-phat-tien-post1493272.html
+https://thanhnien.vn/khi-nao-cccd-gan-chip-va-dinh-danh-dien-tu-phat-huy-het-chuc-nang-post1492779.html
+https://thanhnien.vn/dan-que-me-lam-du-lich-la-lung-ly-lao-lo-post1495787.html
+https://thanhnien.vn/hoang-than-do-souphanouvong-voi-viet-nam-tro-thanh-nguoi-chien-si-cach-mang-post1495786.html
+https://thanhnien.vn/giu-bien-dao-vung-dong-bac-post1495671.html
+https://thanhnien.vn/to-quoc-o-truong-sa-post1495833.html
+https://thanhnien.vn/phat-trien-cong-nghiep-quoc-phong-chu-dong-va-hien-dai-post1490827.html
+https://thanhnien.vn/chu-tich-nuoc-chu-tri-hoi-thao-ve-bao-ve-to-quoc-trong-tinh-hinh-moi-post1488764.html

+ 209 - 0
main.py

@@ -0,0 +1,209 @@
+# Import required modules
+import numpy as np
+from nltk.tokenize import word_tokenize
+import nltk
+nltk.download('punkt')
+from creatdata import twowords
+from creatdata import stopwords
+import time
+import json
+from Suggest_news import news
+
+
+
+start=time.time()
+
+
+#print(len(twowords))
+
+#------------------------------------some functions---------------------------------------------------------
+# Document frequency: for each vocabulary word, count how many sentences contain it
+def count_dict(sentences):
+    word_count = {}
+    for word in word_set:
+        word_count[word] = 0
+        for sent in sentences:
+            if word in sent:
+                word_count[word] += 1
+    return word_count
+
+#Term Frequency
+def termfreq(document, word):
+    N = len(document)
+    occurrence = len([token for token in document if token == word])
+    return occurrence/N
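+# Example: termfreq(["bong", "da", "bong"], "bong") == 2/3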
+
+#Inverse Document Frequency
+def inverse_doc_freq(word):
+    try:
+        word_occurrence = word_count[word] + 1
+    except KeyError:
+        word_occurrence = 1
+    return np.log(total_documents/word_occurrence)
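+# The +1 is a simple smoothing term; a word missing from word_count falls back to
+# a count of 1 and therefore receives the maximum IDF weight.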
+
+#TF-IDF
+def tf_idf(sentence):
+    tf_idf_vec = np.zeros((len(word_set),))
+    for word in sentence:
+        if word in word_set:
+            tf = termfreq(sentence, word)
+            idf = inverse_doc_freq(word)
+            value = tf*idf
+            tf_idf_vec[index_dict[word]] = value
+    return tf_idf_vec
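+# Each document is encoded as a dense vector of length len(word_set);
+# index_dict maps every vocabulary word to its fixed position in that vector.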
+
+#Cosine similarity
+def cos_cal(vec_a, vec_b):
+    cos_sim = np.inner(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
+    return cos_sim
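+# Note: this returns NaN when either vector has zero norm, e.g. an article that
+# shares no vocabulary with the history corpus.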
+
+#jaccard similarity
+def jaccard_similarity(A, B):
+    tmp = 0
+    for i in range(len(A)):
+        if A[i] == B[i]:
+            tmp += 1
+    return tmp/len(A)
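+# Note: for equal-length binary vectors this is the fraction of positions where
+# A and B agree (the simple matching coefficient, which also counts shared zeros),
+# not the set-based Jaccard index |A∩B| / |A∪B|.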
+
+
+#*********************************************************************************************
+
+#Preprocessing the text data
+sentences = []
+word_set = []
+
+with open("baothethao2.txt") as f:
+  contents = f.readlines()
+
+#print(contents)
+
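+# Build the vocabulary from the input articles: tokens are lower-cased alphabetic
+# words; two consecutive words that form a known compound in `twowords` replace the
+# single-word entry, and stopwords are skipped. `check` skips the word that follows
+# a merged compound, `check2` remembers whether the previous single word was just added.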
+for sent in contents:
+    tmp=""
+    check=False
+    x = [i.lower() for  i in word_tokenize(sent) if i.isalpha()]
+    sentences.append(x)
+    check2=False
+    for word in x:
+        if check:
+            check=False
+            continue
+        tmp2=tmp+word
+        tmp = word + ' '
+        if (tmp2 in twowords) and (tmp2 not in word_set):
+            if check2: word_set.pop()
+            word_set.append(tmp2)
+            check = True
+            check2=False
+        elif (word in word_set) or (word in stopwords): continue
+        else:
+            word_set.append(word)
+            check2=True
+with open('result.txt', 'w', encoding='utf8') as json_file:
+    json.dump(word_set, json_file, ensure_ascii=False)
+#print(len(word_set))
+
+#Set of vocab
+word_set = set(word_set)
+#Total documents in our corpus
+total_documents = len(sentences)
+
+#Creating an index for each word in our vocab.
+index_dict = {} #Dictionary to store index for each word
+i = 0
+for word in word_set:
+    index_dict[word] = i
+    i += 1
+#Creating word_count
+word_count = count_dict(sentences)
+with open('count_dict.txt', 'w', encoding='utf8') as json_file:
+    json.dump(word_count, json_file, ensure_ascii=False)
+
+# TF-IDF encode the text corpus (note: vectors is not used further below)
+vectors = []
+for sent in sentences:
+    vec = tf_idf(sent)
+    vectors.append(vec)
+
+# Create the user vector
+tf_words = {}
+vec_user = []
+total_words = 0
+for sent in sentences:
+    total_words += len(sent)
+
+avg=0
+jac_user_vec=[]
+
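+# tf_words[word] accumulates the word's term frequency over the whole history;
+# avg is the mean frequency, later used as the threshold that binarises the profile
+# into jac_user_vec (1 if above average, 0 otherwise).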
+for word in word_set:
+    tf_words[word] = 0
+    for sent in sentences:
+        if word in sent:
+            for tmp_word in sent:
+                if word == tmp_word:
+                    tf_words[word] += 1/total_words
+    avg += tf_words[word]/len(word_set)
+
+
+with open('tf_word.txt', 'w', encoding='utf8') as json_file:
+    json.dump(tf_words, json_file, ensure_ascii=False)
+
+for word, freq in tf_words.items():
+    jac_user_vec.append(1 if freq >= avg else 0)
+    vec_user.append(freq)
+
+#print(vec_user)
+
+#*********************************************************************************************
+#Testing
+
+'''with open("testingrss.txt") as fii:
+  thethao = fii.readlines()'''
+
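+# Score each candidate article from Suggest_news: the article content is tokenised,
+# encoded with the same TF-IDF vocabulary, and compared to the user vector with
+# cosine similarity; scores and links are stored per article title.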
+res_cos={}
+link_title={}
+
+for i in range(len(news)):
+    sent = news[i]['content']
+    x = [tok.lower() for tok in word_tokenize(sent) if tok.isalpha()]
+    sentences.append(x)
+
+    t_vec = tf_idf(x)
+
+    # Binarise the article vector with its own mean weight (used by the
+    # Jaccard-style comparison, which is currently commented out below)
+    jac_t_vec = []
+    avg = 0
+    for value in t_vec:
+        avg += value/len(t_vec)
+    for weight in t_vec:
+        jac_t_vec.append(1 if weight >= avg else 0)
+
+    A = jac_user_vec
+    B = jac_t_vec
+    val = cos_cal(t_vec, vec_user)
+    res_cos[news[i]['title']] = val
+    link_title[news[i]['title']] = news[i]['link']
+    sentences.pop()
+
+res_cos={k: v for k, v in sorted(res_cos.items(), key=lambda item: item[1], reverse=True)}
+#res_jac={k: v for k, v in sorted(res_jac.items(), key=lambda item: item[1], reverse=True)}
+'''
+for key in res_cos:
+    print(key, ' : ', res_cos[key],'\n')
+
+print('----------------------- \n')
+
+"""
+for key in res_jac:
+    print(key, ' : ', res_jac[key],'\n')
+"""
+'''
+
+print("--- %s seconds ---" % (time.time() - start))
+print(link_title)

File diff suppressed because it is too large
+ 1 - 0
result.txt


File diff suppressed because it is too large
+ 654 - 0
resutl2.txt


+ 17 - 0
testinggui.py

@@ -0,0 +1,17 @@
+from tkinter import *
+import webbrowser
+
+def weblink(*args):
+    # Open the selected listbox entry in the default browser if it is a link;
+    # <<ListboxSelect>> also fires on deselection, so guard against an empty selection
+    selection = lb.curselection()
+    if not selection:
+        return
+    item = lb.get(selection[0])
+    if 'https://' in item:
+        webbrowser.open_new(item)
+
+
+root = Tk()
+lb = Listbox(root)
+lb.bind('<<ListboxSelect>>', weblink)
+# list_of_items is not defined in this snippet; a placeholder list of suggested
+# article links is assumed here (e.g. the links produced by main.py)
+list_of_items = ['https://thanhnien.vn/']
+for item in list_of_items:
+    lb.insert(END, item)
+lb.pack()
+root.mainloop()

+ 0 - 0
testingrss.txt


File diff suppressed because it is too large
+ 1 - 0
tf_word.txt


+ 123 - 0
tmp.py

@@ -0,0 +1,123 @@
+import sys
+import time
+# from sys import platform
+#!/usr/bin/python
+
+# -*- coding: utf8 -*-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from pygologin.gologin import GoLogin
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException
+from selenium.common.exceptions import StaleElementReferenceException
+import json
+
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup
+import requests
+
+'''gl = GoLogin({
+    "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MmY3Yjk3NGQxZGNkYmJjYzA5ODUyODciLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MmY3Y2E2OTgwZGRjMDU1YjliZTVlMjMifQ.__GwUyY80hIVJ8o2Ak0wntHYizNwWrm42h-k7q0xxJE",
+    "profile_id": "62f7b974d1dcdb43cb985289",
+    # "port": random_port
+}'''
+capa = DesiredCapabilities.CHROME
+capa["pageLoadStrategy"] = "none"
+
+chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
+#debugger_address = gl.start()
+chrome_options = Options()
+chrome_options.add_experimental_option("useAutomationExtension", False)
+chrome_options.add_experimental_option("excludeSwitches",["enable-automation"])
+#chrome_options.add_experimental_option("debuggerAddress", debugger_address)
+driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options, desired_capabilities=capa)
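+# Note: with Selenium 4+ the executable_path and desired_capabilities arguments are
+# deprecated; the driver path is passed through a Service object and the page load
+# strategy is set via chrome_options.page_load_strategy = "none".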
+# driver = webdriver.Chrome("/Users/nguyenductai/Downloads/chromedriver2")
+# ----------------------------
+
+f=open("linksthethao.txt", "w")
+
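+# Walk the paginated "thoi-su/chinh-tri" category pages of thanhnien.vn and collect
+# every article link inside <article class="story"> blocks.
+# Note: range(2, 3) only fetches page 2; widen the range to crawl more pages.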
+for i in range(2,3):
+    url='https://thanhnien.vn/thoi-su/chinh-tri/?trang='+str(i)
+    url1 = requests.get(url)
+    soup = BeautifulSoup(url1.content, 'lxml')
+    #items = soup.findAll('item')
+
+    for links in soup.findAll('article', {'class': "story"}):
+        for a in links.findAll('a', {'class': "story__title cms-link"} ,href=True):
+            f.write(a['href'])
+            f.write('\n')
+
+    print(i,'\n')
+
+
+"""    
+url1=requests.get('https://vnexpress.net/rss/the-thao.rss')
+soup=BeautifulSoup(url1.content, 'xml')
+items=soup.find_all('item')
+wait=WebDriverWait(driver,200)
+
+'''driver.get("https://vnexpress.net/neymar-mbappe-va-vu-penaltygate-2-0-4501139.html")
+wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/h1')))
+str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
+str=str[:str.rfind('\n')]
+str=str[:str.rfind('\n')]
+str=str[:str.rfind('\n')]
+print(str)'''
+
+i=0
+for item in items:
+    i+=1
+    title=item.title.text
+    link=item.link.text
+    #print("Link: ", link, '\n\n')
+    url2=requests.get(link)
+    #---------
+    t_soup=BeautifulSoup(url2.content,'lxml')
+    for headline in t_soup.findAll('h1',{'class':'title-detail'}):
+        f.write(headline.text)
+        f.write('\n')
+    for description in t_soup.findAll('p',{'class':'description'}):
+        f.write(description.text)
+        f.write('\n')
+    str=''
+    for normal in t_soup.findAll('p', {'class': 'Normal'}):
+        str+=normal.text+'\n'
+
+    str = str[:str.rfind('\n')]
+    str = str[:str.rfind('\n')]
+    str+='\n'
+    f.write(str)
+        #print('\n')
+    print(i,'\n')
+
+
+    #print(t_soup)
+    #-----------
+    '''driver.get(link)
+    time.sleep(1)
+    wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/article')))
+    str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
+    str = str[:str.rfind('\n')]
+    str = str[:str.rfind('\n')]
+    str = str[:str.rfind('\n')]
+    str+='\n'
+    f.write(str)
+    print(i)
+    #driver.execute_script("window.stop();")
+    driver.refresh()'''
+    #-------------
+
+"""
+f.close()
+
+
+
+
+
+
+
+
+

File diff suppressed because it is too large
+ 36534 - 0
vietdict.txt


File diff suppressed because it is too large
+ 1942 - 0
vietnamese-stopwords.txt