# tmp.py — thanhnien.vn article-link scraper (pasted-in line-number artifacts removed)
#!/usr/bin/python
# -*- coding: utf8 -*-
import json
import sys
import time
from urllib.parse import urljoin
# from sys import platform

import requests
from bs4 import BeautifulSoup
from pygologin.gologin import GoLogin
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
  19. '''gl = GoLogin({
  20. "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MmY3Yjk3NGQxZGNkYmJjYzA5ODUyODciLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MmY3Y2E2OTgwZGRjMDU1YjliZTVlMjMifQ.__GwUyY80hIVJ8o2Ak0wntHYizNwWrm42h-k7q0xxJE",
  21. "profile_id": "62f7b974d1dcdb43cb985289",
  22. # "port": random_port
  23. }'''
  24. capa = DesiredCapabilities.CHROME
  25. capa["pageLoadStrategy"] = "none"
  26. chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
  27. #debugger_address = gl.start()
  28. chrome_options = Options()
  29. chrome_options.add_experimental_option("useAutomationExtension", False)
  30. chrome_options.add_experimental_option("excludeSwitches",["enable-automation"])
  31. #chrome_options.add_experimental_option("debuggerAddress", debugger_address)
  32. driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options, desired_capabilities=capa)
  33. #driver=webdriver.Chrome("/Users/nguyenductai/Downloads/chromedriver2")'''
  34. # ----------------------------
  35. f=open("linksthethao.txt", "w")
  36. for i in range(2,3):
  37. url='https://thanhnien.vn/thoi-su/chinh-tri/?trang='+str(i)
  38. url1 = requests.get(url)
  39. soup = BeautifulSoup(url1.content, 'lxml')
  40. #items = soup.findAll('item')
  41. for links in soup.findAll('article', {'class': "story"}):
  42. for a in links.findAll('a', {'class': "story__title cms-link"} ,href=True):
  43. f.write(a['href'])
  44. f.write('\n')
  45. print(i,'\n')
  46. """
  47. url1=requests.get('https://vnexpress.net/rss/the-thao.rss')
  48. soup=BeautifulSoup(url1.content, 'xml')
  49. items=soup.find_all('item')
  50. wait=WebDriverWait(driver,200)
  51. '''driver.get("https://vnexpress.net/neymar-mbappe-va-vu-penaltygate-2-0-4501139.html")
  52. wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/h1')))
  53. str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
  54. str=str[:str.rfind('\n')]
  55. str=str[:str.rfind('\n')]
  56. str=str[:str.rfind('\n')]
  57. print(str)'''
  58. i=0
  59. for item in items:
  60. i+=1
  61. title=item.title.text
  62. link=item.link.text
  63. #print("Link: ", link, '\n\n')
  64. url2=requests.get(link)
  65. #---------
  66. t_soup=BeautifulSoup(url2.content,'lxml')
  67. for headline in t_soup.findAll('h1',{'class':'title-detail'}):
  68. f.write(headline.text)
  69. f.write('\n')
  70. for description in t_soup.findAll('p',{'class':'description'}):
  71. f.write(description.text)
  72. f.write('\n')
  73. str=''
  74. for normal in t_soup.findAll('p', {'class': 'Normal'}):
  75. str+=normal.text+'\n'
  76. str = str[:str.rfind('\n')]
  77. str = str[:str.rfind('\n')]
  78. str+='\n'
  79. f.write(str)
  80. #print('\n')
  81. print(i,'\n')
  82. #print(t_soup)
  83. #-----------
  84. '''driver.get(link)
  85. time.sleep(1)
  86. wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/article')))
  87. str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
  88. str = str[:str.rfind('\n')]
  89. str = str[:str.rfind('\n')]
  90. str = str[:str.rfind('\n')]
  91. str+='\n'
  92. f.write(str)
  93. print(i)
  94. #driver.execute_script("window.stop();")
  95. driver.refresh()'''
  96. #-------------
  97. """
  98. f.close()