# crawldatanews3.py
  1. import sys
  2. import time
  3. # from sys import platform
  4. #!/usr/bin/python
  5. # -*- coding: utf8 -*-
  6. from bs4 import BeautifulSoup
  7. import requests
  8. import time
  9. start=time.time()
  10. fi=open("linksthethao.txt","r")
  11. fo=open("baothethao2.txt","w")
  12. i=0
  13. for line in fi.readlines():
  14. link=line.strip()
  15. url=requests.get(link)
  16. t_soup = BeautifulSoup(url.text, 'lxml')
  17. for headline in t_soup.findAll('h1', {'class': 'details__headline cms-title'}):
  18. fo.write(headline.text)
  19. fo.write('\n')
  20. for description in t_soup.findAll('div', {'class': 'sapo cms-desc'}):
  21. fo.write(description.text)
  22. fo.write('\n')
  23. str = ''
  24. for contents in t_soup.findAll('div', {'class': 'cms-body detail'}):
  25. for content in contents.findAll('p'):
  26. fo.write(content.text)
  27. fo.write('\n')
  28. i+=1
  29. print(i)
  30. if (i==50): break
  31. fi.close()
  32. fo.close()
  33. print("--- %s seconds ---" % (time.time() - start))