testing_scrape.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. from bs4 import BeautifulSoup
  2. import requests
  3. from sys import platform
  4. import time
  5. # from sys import platform
  6. #!/usr/bin/python
  7. # -*- coding: utf8 -*-
  8. from selenium import webdriver
  9. from selenium.webdriver.chrome.options import Options
  10. from pygologin.gologin import GoLogin
  11. from selenium.webdriver.common.by import By
  12. from selenium.webdriver.support.ui import WebDriverWait
  13. from selenium.webdriver.support import expected_conditions as EC
  14. from selenium.common.exceptions import NoSuchElementException
  15. from selenium.common.exceptions import StaleElementReferenceException
  16. import json
  17. from underthesea import ner
  18. from bs4 import BeautifulSoup
  19. import requests
  20. from googlesearch import *
  21. from datetime import datetime
  22. from datetime import timedelta
  23. """
  24. TOKEN="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI2MzNkZDJlOWYwMzIwMjBkYWQwNDU2ZTciLCJ0eXBlIjoiZGV2Iiwiand0aWQiOiI2MzNkZDM0YWM5OWFmMmMzMzdkMjNmNGQifQ.7UmxqoGmN25EwG1DmN-2aJZqbBUY3R4hgKJciKgUwRg"
  25. link="https://ipinfo.io/"
  26. gl = GoLogin({
  27. "token": TOKEN,
  28. 'tmpdir':"/tmp/",
  29. "local":True,
  30. "credentials_enable_service": False,
  31. })
  32. profile_id = gl.create({
  33. "name": 'profile_1',
  34. "os": 'mac',
  35. "proxyEnabled": True,
  36. "navigator": {
  37. "language": 'en-US,en;q=0.9,he;q=0.8',
  38. "userAgent": 'MyUserAgent',
  39. "resolution": '1024x768',
  40. "platform": 'darwin',
  41. },
  42. "proxy":{
  43. 'mode': 'http',
  44. 'host': "139.99.237.62",
  45. 'port': "80",
  46. 'username': "",
  47. 'password': "",
  48. }
  49. });
  50. 'host': "139.99.237.62",
  51. 'port': ,
  52. 'username': "",
  53. 'password': "",
  54. gl = GoLogin({
  55. "token": TOKEN,
  56. 'profile_id':profile_id,
  57. })
  58. chrome_driver_path = "/Users/nguyenductai/Downloads/chromedriver"
  59. debugger_address = gl.start()
  60. chrome_options = Options()
  61. chrome_options.add_experimental_option("debuggerAddress", debugger_address)
  62. driver = webdriver.Chrome(executable_path=chrome_driver_path, options=chrome_options)
  63. driver.get(link)
  64. gl.delete(profile_id)
  65. driver.close()
  66. print("end session!")
  67. # ----------------------------
  68. """
  69. link="https://toquoc.vn/van-hoa-khong-co-su-cao-thap-nho-hay-lon-ma-chi-co-su-da-dang-net-dac-sac-tieu-bieu-can-duoc-ton-trong-ton-vinh-phat-huy-giu-gin-20221006225030042.htm"
  70. news = {}
  71. t_title = ""
  72. t_description = ""
  73. t_contents = ''
  74. url = requests.get(link)
  75. t_soup = BeautifulSoup(url.text, 'lxml')
  76. for title in t_soup.findAll('h1', {'class': 'entry-title'}):
  77. t_title = title.text
  78. for description in t_soup.findAll('h2', {'class': 'sapo'}):
  79. t_description = description.text
  80. for contents in t_soup.findAll('div', {'data-role': 'content'}):
  81. for content in contents.findAll('p'):
  82. t_contents += content.text + ". "
  83. news = {'title': t_title, 'description': t_description, 'content': t_contents, 'category': "",'date':""}
  84. print(news)