#!/usr/bin/python
# -*- coding: utf-8 -*-
# crawl_from_others_tesing_google.py
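"""Search Google for copies of baomoi.vn articles on other Vietnamese news
sites and scrape whatever is found.

For each article in baomoi_testing_crawling.txt, extract the nouns and proper
nouns from the title with underthesea's NER, build a Google query restricted
to one site and to a one-day window around the article's date, open it through
a GoLogin-profiled Chrome (one profile per proxy, to dodge Google's rate
limiting), follow the first on-site result, scrape title / description / body
with per-site selectors, and append everything as one JSON line to testing.txt.
"""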
from sys import platform
import time
import json

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pygologin.gologin import GoLogin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from underthesea import ner
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from datetime import timedelta
# Load proxies (host:port, one per line) from disk; strip() drops the
# trailing newline that would otherwise end up in the port field.
proxy_list = []
for line in open("proxylist.txt", "r"):
    proxy_list.append(line.strip().split(":"))
proxy_check = []  # parallel list: True where the proxy turned out usable
# GoLogin API token (dev token embedded in the original script).
TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzU4ZGExYTMyMzA4NDUzNDYwYjMwOTQiLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzU4ZGEzNDM5OGJmNTFkM2IyMjc5OTQifQ.8LBET_Bp0BK7W7nCafDQD1BV3nKkmKIXA7iltU0z0VA"
gl = GoLogin({
    "token": TOKEN,
    'tmpdir': "/tmp/",
    "local": True,
    "credentials_enable_service": False,
})
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
debugger_address = gl.start()
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", debugger_address)
# Selenium 3 style call; Selenium 4 passes a Service object instead of
# executable_path.
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
  36. """
  37. profile_id = gl.create({
  38. "name": 'profile_1',
  39. "os": 'mac',
  40. "proxyEnabled": True,
  41. "navigator": {
  42. "language": 'en-US,en;q=0.9,he;q=0.8',
  43. "userAgent": 'MyUserAgent',
  44. "resolution": '1024x768',
  45. "platform": 'darwin',
  46. },
  47. "proxy":{
  48. 'mode': 'http',
  49. 'host': host,
  50. 'port': port,
  51. 'username': "prateep6793",
  52. 'password': "Zing1234",
  53. }
  54. });
  55. """
def create_new_profile_id(gl, i):
    """Create a GoLogin profile that routes through the i-th proxy."""
    host = proxy_list[i][0]
    port = proxy_list[i][1]
    profile_id = gl.create({
        "name": 'profile_1',
        "os": 'mac',
        "proxyEnabled": True,
        "navigator": {
            "language": 'en-US,en;q=0.9,he;q=0.8',
            "userAgent": 'MyUserAgent',
            "resolution": '1024x768',
            "platform": 'darwin',
        },
        "proxy": {
            'mode': 'http',
            'host': host,
            'port': port,
            'username': "prateep6793",
            'password': "Zing1234",
        },
    })
    return profile_id
def clear_proxy_list(gl, driver):
    """Try every proxy by opening a fresh GoLogin profile through it, and
    record in proxy_check which ones actually work."""
    i = 0
    while i < len(proxy_list):
        try:
            profile_id = create_new_profile_id(gl, i)
            gl = GoLogin({
                "token": TOKEN,
                'profile_id': profile_id,
            })
            debugger_address = gl.start()
            chrome_options = Options()
            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
            driver = webdriver.Chrome(executable_path=chrome_driver_path,
                                      options=chrome_options)
            proxy_check.append(True)
        except Exception:
            print("Error Proxy!")
            proxy_check.append(False)
        i += 1
# ----------------------------
'''
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
chrome_options = Options()
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
'''
start = time.time()
print("ok")
# ----------some def------------------
def find_first_link():
    """Print every link on the current page that points into `site`
    (relies on the globals `driver` and `site`)."""
    for tmp in driver.find_elements(By.TAG_NAME, 'a'):
        extracted_link = tmp.get_attribute("href")
        if extracted_link is not None:
            if extracted_link.startswith("https://" + site + "/"):
                print(extracted_link)
def create_link(site):
    """Build a Google search URL for `searching_key`, restricted to `site`
    and to a one-day window around the global `date`."""
    link = ('https://www.google.com/search?q=' + searching_key + '+site%3A' + site
            + '&sxsrf=ALiCzsbBtWjs-pcdgMW06QAzFmDQAIJemg%3A1663745112460&source=lnt')
    date_from = date - timedelta(days=1)
    date_to = date + timedelta(days=1)
    year_from = date_from.strftime("%Y")
    year_to = date_to.strftime("%Y")
    month_from = date_from.strftime("%m")
    month_to = date_to.strftime("%m")
    day_from = date_from.strftime("%d")
    day_to = date_to.strftime("%d")
    # cd_min is the earlier bound of the custom date range, cd_max the later.
    tmp = ("&tbs=cdr:1,cd_min:" + month_from + "/" + day_from + "/" + year_from
           + ",cd_max:" + month_to + "/" + day_to + "/" + year_to)
    # print(link + tmp)
    return link + tmp
def crawl(link, site):
    """Fetch the article at `link` and extract title, description, and body
    text using site-specific CSS classes. Returns {} if no title was found."""
    t_title = ""
    t_description = ""
    t_contents = ''
    response = requests.get(link)
    t_soup = BeautifulSoup(response.text, 'lxml')
    if site == "thanhnien.vn":
        for title in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('div', {'class': 'cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "vnexpress.net":
        for title in t_soup.findAll('h1', {'class': 'title-detail'}):
            t_title = title.text
        for description in t_soup.findAll('p', {'class': 'description'}):
            t_description = description.text
        for contents in t_soup.findAll('p', {'class': 'Normal'}):
            t_contents += contents.text + ". "
    elif site == "tienphong.vn":
        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td', {'class': 'caption'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "vov.vn":
        for title in t_soup.findAll('div', {'class': 'row article-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'row article-summary'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'row article-content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td'):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "nhandan.vn":
        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td', {'class': 'caption'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "zingnews.vn":
        for title in t_soup.findAll('h1', {'class': 'the-article-title'}):
            t_title = title.text
        for description in t_soup.findAll('p', {'class': 'the-article-summary'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'the-article-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "tuoitre.vn":
        for title in t_soup.findAll('h1', {'class': 'article-title'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'content fck'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "cand.com.vn":
        for title in t_soup.findAll('h1', {'class': 'box-title-detail entry-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'box-des-detail this-one'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'detail-content-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "vtv.vn":
        for title in t_soup.findAll('h1', {'class': 'title_detail'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'ta-justify'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "24h.com.vn":
        for title in t_soup.findAll('h1', {'class': 'clrTit bld tuht_show'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'ctTp tuht_show'}):
            t_description = description.text
        for contents in t_soup.findAll('article', {'class': 'nwsHt nwsUpgrade'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "dantri.com.vn":
        for title in t_soup.findAll('h1', {'class': 'title-page detail'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'singular-sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'singular-content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "baophapluat.vn":
        for title in t_soup.findAll('h1', {'class': 'article__title cms-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'article__sapo cms-desc'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'article__body cms-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
        for contents in t_soup.findAll('td', {'class': 'caption'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "kenh14.vn":
        for title in t_soup.findAll('h1', {'class': 'kbwc-title'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'knc-sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'knc-content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "laodong.vn":
        for title in t_soup.findAll('h1', {'class': 'title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'chappeau'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'art-body'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "qdnd.vn":
        for title in t_soup.findAll('h1', {'class': 'post-title'}):
            t_title = title.text
        for description in t_soup.findAll('div', {'class': 'post-summary'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'post-content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "vtc.vn":
        for title in t_soup.findAll('h1', {'class': 'font28 bold lh-1-3'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'font18 bold inline-nb'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'class': 'edittor-content box-cont mt15 clearfix '}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    elif site == "toquoc.vn":
        for title in t_soup.findAll('h1', {'class': 'entry-title'}):
            t_title = title.text
        for description in t_soup.findAll('h2', {'class': 'sapo'}):
            t_description = description.text
        for contents in t_soup.findAll('div', {'data-role': 'content'}):
            for content in contents.findAll('p'):
                t_contents += content.text + ". "
    if t_title == "":
        return {}
    news = {'title': t_title, 'description': t_description,
            'content': t_contents, 'category': "", 'date': ""}
    return news
# -----------------------
sites = {'vnexpress.net', 'thanhnien.vn', 'tienphong.vn',
         'vov.vn', 'nhandan.vn', 'zingnews.vn',
         'tuoitre.vn', 'cand.com.vn', 'vtv.vn',
         '24h.com.vn', 'dantri.com.vn', 'baophapluat.vn',
         'kenh14.vn', 'laodong.vn', 'qdnd.vn', 'vtc.vn',
         'toquoc.vn'}
fi = open("baomoi_testing_crawling.txt", "r")
fo = open("testing.txt", 'w')
# --------------------------
# Validate every proxy once before crawling.
clear_proxy_list(gl, driver)
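# Main loop: one baomoi article per input line; the output written to
# testing.txt is one JSON list per article,
# [original_json_line, matched_article_from_site_A, ...].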
for line in fi.readlines():
    a = json.loads(line)
    t_str = ner(a["title"])
    # --- parse the article date ("YYYY-MM-DD ...") ---
    t_date = a["date"]
    year = int(t_date[0:4])
    month = int(t_date[5:7])
    day = int(t_date[8:10])
    date = datetime(year, month, day)
    # --- build the query from the nouns (N) and proper nouns (Np) in the title ---
    searching_key = ''
    for words in t_str:
        if (words[1] == "N") or (words[1] == "Np"):
            searching_key += '"' + words[0] + '"' + "%2B"
    searching_key = searching_key.replace(" ", "+")
    searching_key = searching_key[0:len(searching_key) - 3]  # drop the trailing "%2B"
    source = [line]
    for site in sites:
        # print(create_link(site))
        check_link = create_link(site)
        driver.get(check_link)
        time.sleep(1)
        # print(check_link)
        if driver.current_url.find("sorry") != -1:
            # Google redirected to its /sorry CAPTCHA page.
            print("Error!")
        else:
            print(driver.current_url)
        """
        try:
            i += 1
            driver.close()
            gl = GoLogin({
                "token": TOKEN,
                'profile_id': create_new_profile_id(gl, i),
            })
            debugger_address = gl.start()
            chrome_options = Options()
            chrome_options.add_experimental_option("debuggerAddress", debugger_address)
            driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
            driver.get(check_link)
        except:
            pass
        # print("#---------------------" + "\n")
        '''
        while (driver.current_url != check_link):
            driver.delete_all_cookies()
            driver.refresh()
            driver.get(check_link)
        '''
        """
        # driver.execute_script("window.open(" + "'" + create_link(site) + "'" + ");")
        # Take the first result that points back into the target site.
        for tmp in driver.find_elements(By.TAG_NAME, 'a'):
            extracted_link = tmp.get_attribute("href")
            if extracted_link is not None:
                if extracted_link.startswith("https://" + site + "/"):
                    print(extracted_link)
                    news = crawl(extracted_link, site)
                    if news != {}:
                        source.append(news)
                    break
        # time.sleep(2)
        # crawling(create_link(site))
    # print(source)
    fo.write(json.dumps(source, ensure_ascii=False))
    fo.write('\n')
fi.close()
fo.close()
print("--- %s seconds ---" % (time.time() - start))