#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
import json
from urllib.parse import urljoin
# from sys import platform

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from pygologin.gologin import GoLogin
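# Crawler with two pipelines: the active one collects article links from the
# thanhnien.vn politics listing into linksthethao.txt; a disabled one further
# down scrapes full articles from the VnExpress sports RSS feed via Selenium.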
'''gl = GoLogin({
    "token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MmY3Yjk3NGQxZGNkYmJjYzA5ODUyODciLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MmY3Y2E2OTgwZGRjMDU1YjliZTVlMjMifQ.__GwUyY80hIVJ8o2Ak0wntHYizNwWrm42h-k7q0xxJE",
    "profile_id": "62f7b974d1dcdb43cb985289",
    # "port": random_port
})'''
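# When the GoLogin block above is enabled, gl.start() launches the profile's
# browser and returns a debugger address that Chrome attaches to through the
# "debuggerAddress" option below.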
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"  # return control as soon as navigation starts
chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver2"
# debugger_address = gl.start()
chrome_options = Options()
# Hide the "Chrome is being controlled by automated software" banner/extension.
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# chrome_options.add_experimental_option("debuggerAddress", debugger_address)
# Selenium 3-style constructor; Selenium 4 replaces executable_path and
# desired_capabilities with Service(...) and Options.page_load_strategy.
driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options, desired_capabilities=capa)
# driver = webdriver.Chrome("/Users/nguyenductai/Downloads/chromedriver2")
# ----------------------------
# Collect article links from the thanhnien.vn politics listing pages.
# (The output file name says "thethao" (sports), but the active crawl
# targets the chinh-tri (politics) section.)
f = open("linksthethao.txt", "w")
for i in range(2, 3):  # only page 2 for now; widen the range to crawl more pages
    url = 'https://thanhnien.vn/thoi-su/chinh-tri/?trang=' + str(i)
    url1 = requests.get(url)
    soup = BeautifulSoup(url1.content, 'lxml')
    # items = soup.findAll('item')
    for links in soup.findAll('article', {'class': "story"}):
        for a in links.findAll('a', {'class': "story__title cms-link"}, href=True):
            f.write(a['href'])
            f.write('\n')
    print(i, '\n')
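# A possible follow-up pass (sketch, left disabled like the blocks below): read
# the saved links back and fetch each article. The 'details__content' selector
# is an assumption, not a verified thanhnien.vn class name.
'''with open("linksthethao.txt") as links_file:
    for link in links_file:
        page = requests.get(link.strip())
        page_soup = BeautifulSoup(page.content, 'lxml')
        for body in page_soup.findAll('div', {'class': 'details__content'}):  # hypothetical selector
            print(body.text)'''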
- """
- url1=requests.get('https://vnexpress.net/rss/the-thao.rss')
- soup=BeautifulSoup(url1.content, 'xml')
- items=soup.find_all('item')
- wait=WebDriverWait(driver,200)
- '''driver.get("https://vnexpress.net/neymar-mbappe-va-vu-penaltygate-2-0-4501139.html")
- wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/h1')))
- str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
- str=str[:str.rfind('\n')]
- str=str[:str.rfind('\n')]
- str=str[:str.rfind('\n')]
- print(str)'''
- i=0
- for item in items:
- i+=1
- title=item.title.text
- link=item.link.text
- #print("Link: ", link, '\n\n')
- url2=requests.get(link)
- #---------
- t_soup=BeautifulSoup(url2.content,'lxml')
- for headline in t_soup.findAll('h1',{'class':'title-detail'}):
- f.write(headline.text)
- f.write('\n')
- for description in t_soup.findAll('p',{'class':'description'}):
- f.write(description.text)
- f.write('\n')
- str=''
- for normal in t_soup.findAll('p', {'class': 'Normal'}):
- str+=normal.text+'\n'
- str = str[:str.rfind('\n')]
- str = str[:str.rfind('\n')]
- str+='\n'
- f.write(str)
- #print('\n')
- print(i,'\n')
- #print(t_soup)
- #-----------
- '''driver.get(link)
- time.sleep(1)
- wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/section[4]/div/div[2]/article')))
- str = driver.find_element(By.XPATH, '/html/body/section[4]/div/div[2]/article').text
- str = str[:str.rfind('\n')]
- str = str[:str.rfind('\n')]
- str = str[:str.rfind('\n')]
- str+='\n'
- f.write(str)
- print(i)
- #driver.execute_script("window.stop();")
- driver.refresh()'''
- #-------------
- """
f.close()
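# Release the ChromeDriver session opened above.
driver.quit()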