crawl_links.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. import time
  2. # from sys import platform
  3. #!/usr/bin/python
  4. # -*- coding: utf8 -*-
  5. from bs4 import BeautifulSoup
  6. import bs4
  7. import requests
  8. from urllib.parse import urljoin
  9. import time
  10. base = 'https://baomoi.com/'
  11. #start=time.time()
  12. f=open("links2.txt","w")
  13. def scraping_link(link):
  14. url = requests.get(link)
  15. soup = BeautifulSoup(url.content, 'lxml')
  16. for links in soup.findAll('div',{'class':'bm_O'}):
  17. for a in links.findAll('a', href=True):
  18. f.write(urljoin(base,a['href']))
  19. f.write('\n')
  20. for i in range(1,168):
  21. start = time.time()
  22. print(i)
  23. #scraping_link("https://baomoi.com/tin-moi/trang+"+str(i)+".epi")
  24. scraping_link("https://baomoi.com/the-gioi/trang+"+str(i)+".epi")
  25. #----
  26. scraping_link("https://baomoi.com/thoi-su/trang+"+str(i)+".epi")
  27. scraping_link("https://baomoi.com/giao-thong/trang+"+str(i)+".epi")
  28. scraping_link("https://baomoi.com/moi-truong-khi-hau/trang+"+str(i)+".epi")
  29. #----
  30. scraping_link("https://baomoi.com/nghe-thuat/trang+"+str(i)+".epi")
  31. scraping_link("https://baomoi.com/am-thuc/trang+"+str(i)+".epi")
  32. scraping_link("https://baomoi.com/du-lich/trang+"+str(i)+".epi")
  33. #----
  34. scraping_link("https://baomoi.com/lao-dong-viec-lam/trang+"+str(i)+".epi")
  35. scraping_link("https://baomoi.com/tai-chinh/trang+"+str(i)+".epi")
  36. scraping_link("https://baomoi.com/chung-khoan/trang+"+str(i)+".epi")
  37. scraping_link("https://baomoi.com/kinh-doanh/trang+"+str(i)+".epi")
  38. #-----
  39. scraping_link("https://baomoi.com/hoc-bong-du-hoc/trang+" + str(i) + ".epi")
  40. scraping_link("https://baomoi.com/dao-tao-thi-cu/trang+" + str(i) + ".epi")
  41. #-----
  42. scraping_link("https://baomoi.com/bong-da-quoc-te/trang+" + str(i) + ".epi")
  43. scraping_link("https://baomoi.com/bong-da-viet-nam/trang+" + str(i) + ".epi")
  44. scraping_link("https://baomoi.com/quan-vot/trang+" + str(i) + ".epi")
  45. #---
  46. scraping_link("https://baomoi.com/am-nhac/trang+" + str(i) + ".epi")
  47. scraping_link("https://baomoi.com/thoi-trang/trang+" + str(i) + ".epi")
  48. scraping_link("https://baomoi.com/dien-anh-truyen-hinh/trang+" + str(i) + ".epi")
  49. #---
  50. scraping_link("https://baomoi.com/an-ninh-trat-tu/trang+" + str(i) + ".epi")
  51. scraping_link("https://baomoi.com/hinh-su-dan-su/trang+" + str(i) + ".epi")
  52. # ---
  53. scraping_link("https://baomoi.com/cntt-vien-thong/trang+" + str(i) + ".epi")
  54. scraping_link("https://baomoi.com/thiet-bi-phan-cung/trang+" + str(i) + ".epi")
  55. # ---
  56. scraping_link("https://baomoi.com/khoa-hoc/trang+" + str(i) + ".epi")
  57. # ---
  58. scraping_link("https://baomoi.com/dinh-duong-lam-dep/trang+" + str(i) + ".epi")
  59. scraping_link("https://baomoi.com/tinh-yeu-hon-nhan/trang+" + str(i) + ".epi")
  60. scraping_link("https://baomoi.com/suc-khoe-y-te/trang+" + str(i) + ".epi")
  61. # ---
  62. scraping_link("https://baomoi.com/xe-co/trang+" + str(i) + ".epi")
  63. # ---
  64. scraping_link("https://baomoi.com/quan-ly-quy-hoach/trang+" + str(i) + ".epi")
  65. scraping_link("https://baomoi.com/khong-gian-kien-truc/trang+" + str(i) + ".epi")
  66. print("--- %s seconds ---" % (time.time() - start))
  67. f.close()