| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226 |
- import sys
- import time
- # from sys import platform
- #!/usr/bin/python
- # -*- coding: utf8 -*-
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from pygologin.gologin import GoLogin
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import NoSuchElementException
- from selenium.common.exceptions import StaleElementReferenceException
- import json
- from underthesea import ner
- from bs4 import BeautifulSoup
- import requests
- from datetime import datetime
- from datetime import timedelta
# Browser setup: obtain a debugger address from GoLogin and attach a Selenium
# Chrome driver to it. Everything below runs at import time.
# NOTE(review): the API token and profile id are hard-coded in source; consider
# loading them from the environment or an untracked config file, and rotating
# this token since it is visible to anyone who can read the file.
gl = GoLogin({
    "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzJjMjliMjJlMjIxZjVlMjc5Yzc4ZTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzJjMmI3OTlmYjIxNDI0YTFmNTQzZTUifQ.GR4iJFqUVRuI3XO_Ns3cfiII2m8CactTGU9jhNaSf-k",
    "profile_id": "632c5184cef566f424ef2e3c",
    # "port": random_port
})
# NOTE(review): machine-specific absolute path to the chromedriver binary.
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
# The value returned by gl.start() is handed to Chrome as its "debuggerAddress"
# option (presumably the host:port of the launched profile - confirm against
# the GoLogin documentation).
debugger_address = gl.start()
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", debugger_address)
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
# ----------------------------
# Disabled alternative kept as an unused string literal: a plain Chrome driver
# created without the GoLogin debugger address.
'''
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
chrome_options = Options()
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
'''
# Wall-clock start time; the elapsed seconds are printed at the end of the script.
start=time.time()
- # ----------some def------------------
def find_first_link():
    """Print every link on the current page that points into ``site``.

    Reads the module-level ``driver`` (Selenium session) and ``site`` (bare
    domain such as ``"vnexpress.net"``). Despite the name, every anchor whose
    ``href`` starts with ``https://<site>/`` is printed, not only the first.
    Returns nothing.
    """
    for anchor in driver.find_elements(By.TAG_NAME, 'a'):
        href = anchor.get_attribute("href")
        # Anchors without an href attribute yield None; skip them.
        if href is None:
            continue
        if href.startswith("https://" + site + "/"):
            print(href)
def create_link(site, query=None, around=None):
    """Build a Google search URL limited to ``site`` and a date window.

    The window runs from one day before ``around`` to one day after it and is
    encoded in Google's custom-date-range parameter as ``MM/DD/YYYY`` with the
    slashes percent-encoded.

    Args:
        site: Bare domain to restrict the search to, e.g. ``"vnexpress.net"``.
        query: Already URL-ready search terms. When omitted, the module-level
            ``searching_key`` is used, which is what existing callers rely on.
        around: ``datetime`` at the centre of the window. When omitted, the
            module-level ``date`` is used.

    Returns:
        The complete search URL as a string.
    """
    if query is None:
        query = searching_key
    if around is None:
        around = date

    # "%%2F" renders as the literal "%2F" (an encoded "/") between the fields.
    day_format = "%m%%2F%d%%2F%Y"
    window_start = (around - timedelta(days=1)).strftime(day_format)
    window_end = (around + timedelta(days=1)).strftime(day_format)

    link = ('https://www.google.com/search?q=' + query + '+site%3A' + site
            + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt&tbs=cdr%3A1%2Ccd_')
    date_range = 'min%3A' + window_start + '%2Ccd_max%3A' + window_end + '&tbm='
    return link + date_range
# Per-site extraction rules used by crawl().
#   "title" / "description": (tag, class attribute value) of the element
#       holding the text.
#   "body": ordered list of (tag, attrs, nested). When ``nested`` is true the
#       text comes from the <p> descendants of each match; otherwise it comes
#       from the match itself.
_CRAWL_RULES = {
    "thanhnien.vn": {
        "title": ("h1", "details__headline cms-title"),
        "description": ("div", "sapo cms-desc"),
        # NOTE(review): the second selector may match the same elements as the
        # first and duplicate their text - confirm this is intended.
        "body": [
            ("div", {"class": "cms-body detail"}, True),
            ("div", {"class": "cms-body"}, True),
        ],
    },
    "vnexpress.net": {
        "title": ("h1", "title-detail"),
        "description": ("p", "description"),
        "body": [("p", {"class": "Normal"}, False)],
    },
    "tienphong.vn": {
        "title": ("h1", "article__title cms-title"),
        "description": ("div", "article__sapo cms-desc"),
        "body": [
            ("div", {"class": "article__body cms-body"}, True),
            ("td", {"class": "caption"}, True),
        ],
    },
    "vov.vn": {
        "title": ("div", "row article-title"),
        "description": ("div", "row article-summary"),
        "body": [
            ("div", {"class": "row article-content"}, True),
            ("td", {}, True),
        ],
    },
    "nhandan.vn": {
        "title": ("h1", "article__title cms-title"),
        "description": ("div", "article__sapo cms-desc"),
        "body": [
            ("div", {"class": "article__body cms-body"}, True),
            ("td", {"class": "caption"}, True),
        ],
    },
    "zingnews.vn": {
        "title": ("h1", "the-article-title"),
        "description": ("p", "the-article-summary"),
        "body": [("div", {"class": "the-article-body"}, True)],
    },
    "tuoitre.vn": {
        "title": ("h1", "article-title"),
        "description": ("h2", "sapo"),
        "body": [("div", {"class": "content fck"}, True)],
    },
}


def crawl(link, site, timeout=10):
    """Download an article page and extract its title, summary and body text.

    Args:
        link: URL of the article to fetch.
        site: Bare domain selecting the extraction rules, e.g.
            ``"vnexpress.net"``. A site without rules yields ``{}`` and no
            request is made.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with keys ``title``, ``description``, ``content``, ``category``
        and ``date`` (the last two are always empty strings), or ``{}`` when
        the request fails or no title element is found on the page.
    """
    rules = _CRAWL_RULES.get(site)
    if rules is None:
        return {}

    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as err:
        print("crawl: request to %s failed: %s" % (link, err))
        return {}

    soup = BeautifulSoup(response.text, 'lxml')

    def last_text(tag, css_class):
        # When several elements match, the last one wins.
        matches = soup.find_all(tag, {'class': css_class})
        return matches[-1].text if matches else ""

    title = last_text(*rules["title"])
    if title == "":
        return {}
    description = last_text(*rules["description"])

    paragraphs = []
    for tag, attrs, nested in rules["body"]:
        for match in soup.find_all(tag, attrs):
            if nested:
                paragraphs.extend(p.text for p in match.find_all('p'))
            else:
                paragraphs.append(match.text)
    # Every paragraph is terminated with ". ", including the last one.
    content = "".join(text + ". " for text in paragraphs)

    return {'title': title, 'description': description, 'content': content, 'category': "", 'date': ""}
- #-----------------------
# News sites that are searched for coverage of each input record.
sites={'vnexpress.net','thanhnien.vn','tienphong.vn','vov.vn','nhandan.vn','zingnews.vn','tuoitre.vn'}

# Main loop. Each input line is a JSON record with at least "title" and "date".
# For every record, Google is searched once per site for the nouns in the
# title, the first result link that yields an article is crawled, and the
# original line plus the crawled articles are written out as one JSON list.
# Explicit UTF-8 is needed because the output is written with
# ensure_ascii=False; the with-block closes both files even if the run aborts.
with open("baomoi_testing_crawling.txt", "r", encoding="utf-8") as fi, \
        open("testing.txt", 'w', encoding="utf-8") as fo:
    for line in fi:
        if not line.strip():
            # Blank lines are not valid JSON records.
            continue
        a = json.loads(line)
        t_str = ner(a["title"])

        # The date string is sliced positionally as YYYY?MM?DD.
        t_date = a["date"]
        date = datetime(int(t_date[0:4]), int(t_date[5:7]), int(t_date[8:10]))

        # Quote every token tagged "N" or "Np", join them with an encoded "+"
        # and turn the spaces inside tokens into "+". create_link() reads
        # ``searching_key`` and ``date`` as module-level names.
        searching_key = "%2B".join(
            '"' + words[0] + '"' for words in t_str if words[1] in ("N", "Np")
        ).replace(" ", "+")

        source = [line]
        # sorted() makes the order of crawled entries reproducible between runs.
        for site in sorted(sites):
            check_link = create_link(site)
            driver.get(check_link)
            time.sleep(0.5)
            # Retry until the browser is on a Google search URL.
            # NOTE(review): there is no retry limit, so this can loop forever.
            while "https://www.google.com/search" not in driver.current_url:
                driver.delete_all_cookies()
                driver.refresh()
                driver.get(check_link)
                time.sleep(0.5)

            site_prefix = "https://" + site + "/"
            for tmp in driver.find_elements(By.TAG_NAME, 'a'):
                extracted_link = tmp.get_attribute("href")
                if extracted_link is None or not extracted_link.startswith(site_prefix):
                    continue
                print(extracted_link)
                news = crawl(extracted_link, site)
                if news:
                    # One article per site is enough; stop at the first hit.
                    source.append(news)
                    break

        fo.write(json.dumps(source, ensure_ascii=False))
        fo.write('\n')

print("--- %s seconds ---" % (time.time() - start))
|