crawl_from_others.py

#!/usr/bin/python
# -*- coding: utf8 -*-
import sys
import time
# from sys import platform
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pygologin.gologin import GoLogin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import json
from underthesea import ner
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from datetime import timedelta
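# Attach Selenium to a Chrome instance launched through a GoLogin browser profile:
# gl.start() boots the profile and returns the remote-debugging address, which is
# handed to chromedriver through the "debuggerAddress" option below.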
gl = GoLogin({
    "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzJjMjliMjJlMjIxZjVlMjc5Yzc4ZTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzJjMmI3OTlmYjIxNDI0YTFmNTQzZTUifQ.GR4iJFqUVRuI3XO_Ns3cfiII2m8CactTGU9jhNaSf-k",
    "profile_id": "632c5184cef566f424ef2e3c",
    # "port": random_port
})
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
debugger_address = gl.start()
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", debugger_address)
# executable_path is the Selenium 3.x calling convention (Selenium 4 moved it to Service).
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
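# Hedged sketch only: if this script is ever run against Selenium 4+, the equivalent
# driver construction (untested here) would be:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)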
# ----------------------------
'''
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
chrome_options = Options()
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
'''
start = time.time()
# ----------some def------------------
def find_first_link():
    # Unused helper: prints every link on the current page that points into the
    # module-level `site`; the main loop below inlines the same logic instead.
    for tmp in driver.find_elements(By.TAG_NAME, 'a'):
        extracted_link = tmp.get_attribute("href")
        if extracted_link is not None:
            if extracted_link.find("https://" + site + "/") == 0:
                print(extracted_link)
def create_link(site):
    # Builds a Google search URL for the module-level `searching_key`, restricted to
    # `site` and to a custom date range of one day either side of the module-level `date`.
    link = 'https://www.google.com/search?q=' + searching_key + '+site%3A' + site + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt&tbs=cdr%3A1%2Ccd_'
    date_from = date - timedelta(days=1)
    date_to = date + timedelta(days=1)
    year_from = date_from.strftime("%Y")
    year_to = date_to.strftime("%Y")
    month_from = date_from.strftime("%m")
    month_to = date_to.strftime("%m")
    day_from = date_from.strftime("%d")
    day_to = date_to.strftime("%d")
    tmp = 'min%3A' + month_from + '%2F' + day_from + '%2F' + year_from + '%2Ccd_max%3A' + month_to + '%2F' + day_to + '%2F' + year_to + '&tbm='
    return link + tmp
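# URL-decoded, the generated query boils down to:
#   https://www.google.com/search?q="kw1"+"kw2" site:<site>&tbs=cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY
# i.e. Google's "custom date range" filter centred on the article's publication date.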
def crawl(link, site):
    """Fetch one article URL and pull title/description/body text using per-site CSS classes.

    Returns {} when no title could be extracted, so callers can skip non-article pages.
    """
    news = {}
    t_title = ""
    t_description = ""
    t_contents = ''
    url = requests.get(link)
    t_soup = BeautifulSoup(url.text, 'lxml')
    if site == "thanhnien.vn":
        for title in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('div', {'class': 'cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if site == "vnexpress.net":
        for title in t_soup.findAll('h1', {'class': 'title-detail'}):
            t_title = title.text
        for description in t_soup.findAll('p', {'class': 'description'}):
            t_description = description.text
        for contents in t_soup.findAll('p', {'class': 'Normal'}):
            t_contents += contents.text + ". "
    if site == "tienphong.vn":
        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td', {'class': 'caption'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if site == "vov.vn":
        for title in t_soup.findAll('div', {'class': 'row article-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'row article-summary'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'row article-content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td'):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if site == "nhandan.vn":
        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td', {'class': 'caption'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if site == "zingnews.vn":
        for title in t_soup.findAll('h1', {'class': 'the-article-title'}):
            t_title = title.text
        for description in t_soup.findAll('p', {'class': 'the-article-summary'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'the-article-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if site == "tuoitre.vn":
        for title in t_soup.findAll('h1', {'class': 'article-title'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'content fck'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "", 'date': ""}
    if t_title == "":
        return {}
    return news
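# Shape of the dict crawl() returns for a matched article (values elided), as consumed
# by the loop below:
#   {'title': '...', 'description': '...', 'content': '...', 'category': '', 'date': ''}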
# -----------------------
sites = {'vnexpress.net', 'thanhnien.vn', 'tienphong.vn', 'vov.vn', 'nhandan.vn', 'zingnews.vn', 'tuoitre.vn'}
# Explicit UTF-8 so the Vietnamese text round-trips regardless of platform default.
fi = open("baomoi_testing_crawling.txt", "r", encoding="utf-8")
fo = open("testing.txt", 'w', encoding="utf-8")
i = 0  # unused counter
# --------------------------
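# Input format assumed from the field accesses below: one JSON object per line with at
# least a "title" string and a "date" string beginning "YYYY-MM-DD".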
for line in fi.readlines():
    a = json.loads(line)
    t_str = ner(a["title"])
    # --- parse the publication date (first 10 characters are YYYY-MM-DD) ---
    t_date = a["date"]
    year = int(t_date[0:4])
    month = int(t_date[5:7])
    day = int(t_date[8:10])
    date = datetime(year, month, day)
    # ---
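    # underthesea.ner() yields (word, POS tag, chunk tag, NER label) tuples, so
    # words[1] below is the POS tag: keeping "N"/"Np" turns the common and proper
    # nouns of the title into quoted, %2B-joined search keywords.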
    searching_key = ''
    for words in t_str:
        if (words[1] == "N") or (words[1] == "Np"):
            searching_key += '"' + words[0] + '"' + "%2B"
    searching_key = searching_key.replace(" ", "+")
    searching_key = searching_key[0:len(searching_key) - 3]  # drop the trailing "%2B"
    source = [line]
    for site in sites:
        # print(create_link(site))
        check_link = create_link(site)
        driver.get(check_link)
        time.sleep(0.5)
        # print(check_link)
        # print(driver.current_url)
        # Retry until Google actually lands on a results page (e.g. after a consent
        # or captcha redirect).
        while driver.current_url.find("https://www.google.com/search") == -1:
            driver.delete_all_cookies()
            driver.refresh()
            driver.get(check_link)
            time.sleep(0.5)
        # print("#---------------------" + "\n")
        '''
        while (driver.current_url != check_link):
            driver.delete_all_cookies()
            driver.refresh()
            driver.get(check_link)
        '''
        # driver.execute_script("window.open(" + "'" + create_link(site) + "'" + ");")
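        # WebDriverWait/EC are imported but never used; a hedged alternative to the
        # fixed sleep above (assumes Google's results container still has id="search"):
        #   WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "search")))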
        # Take the first result that links into the target site, crawl it, then move on
        # to the next site.
        for tmp in driver.find_elements(By.TAG_NAME, 'a'):
            extracted_link = tmp.get_attribute("href")
            if extracted_link is not None:
                if extracted_link.find("https://" + site + "/") == 0:
                    print(extracted_link)
                    news = crawl(extracted_link, site)
                    if news != {}:
                        source.append(news)
                    break
        # time.sleep(2)
        # crawling(create_link(site))
    # print(source)
    fo.write(json.dumps(source, ensure_ascii=False))
    fo.write('\n')
print("--- %s seconds ---" % (time.time() - start))
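# Cleanup the original script leaves out; gl.stop() follows pygologin's start()/stop()
# pairing (an assumption based on its examples), the rest is standard resource release.
fi.close()
fo.close()
driver.quit()
gl.stop()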