crawl_from_Baomoi.py 1.6 KB

#!/usr/bin/python
# -*- coding: utf8 -*-
import sys
import time
# from sys import platform
from bs4 import BeautifulSoup
import requests
import json

start = time.time()
fi = open("links2.txt", "r")
fo = open("", "a")  # output path was left blank in the original; supply one before running
news = {}
# news[len(news)] = {'title': title, 'link': link, 'content': title + ' ' + description, 'category': category, 'page': page}
i = 0
for line in fi.readlines():
    i += 1
    if i < 69878:  # resume point: skip links that were already crawled
        continue
    link = line.strip()
    try:
        url = requests.get(link)
        if url.history:  # the request was redirected, so skip this link
            continue
        t_soup = BeautifulSoup(url.text, 'lxml')
        t_content = ""
        for title in t_soup.findAll('h1', {'class': 'bm_J'}):
            t_title = title.text
        for description in t_soup.findAll('h3', {'class': 'bm_Ak bm_J'}):
            t_description = description.text
        for date in t_soup.findAll('time'):
            if date.has_attr('datetime'):
                t_date = date['datetime']
        for category in t_soup.findAll('a', {'class': 'bm_y'}):
            t_category = category.text
        for content in t_soup.findAll('p', {'class': 'bm_Y'}):
            t_content += content.text + " "
        for content in t_soup.findAll('p', {'class': 'bm_Y bm_FP'}):
            t_content += content.text + " "
        news = {'title': t_title, 'description': t_description, 'content': t_content,
                'category': t_category, 'date': t_date}
        fo.write(json.dumps(news, ensure_ascii=False))
        fo.write('\n')
        print(i)
    except Exception:
        print("Error!")
fi.close()
fo.close()
print("--- %s seconds ---" % (time.time() - start))
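
A minimal sketch of reading the crawler's output back, assuming the blank output path above is filled in with a JSON-lines file; the name "output.jsonl" is only a placeholder and does not come from the original script. Each line of that file holds one article record:

import json

with open("output.jsonl", "r", encoding="utf-8") as f:  # placeholder path, not in the original
    for line in f:
        record = json.loads(line)
        print(record["title"], record["category"], record["date"])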