"""extract_data.py: derive search keys from crawled article titles.

Reads JSON-lines records from ``baomoi_testing_crawling.txt``.  For each
record it runs underthesea's ``ner`` over the ``title`` field, prints the
tagged tokens, slices the ``date`` field into year/month/day strings and
assembles a search key from the tokens tagged as nouns.  Processing stops
after ``MAX_RECORDS`` records.
"""
import json

from bs4 import BeautifulSoup
import requests
from underthesea import ner
from underthesea import pos_tag
from underthesea import word_tokenize

INPUT_PATH = "baomoi_testing_crawling.txt"
# Number of records to process before stopping (the script is a small probe).
MAX_RECORDS = 2
# Tags selected from position 1 of each tagged token.  In underthesea's
# documented tuple output that position is the POS tag ("N" = noun,
# "Np" = proper noun) -- NOTE(review): confirm against the installed version.
NOUN_TAGS = ("N", "Np")


def _build_searching_key(tagged_tokens):
    """Return the search key built from the noun tokens of *tagged_tokens*.

    Each selected token's text (item 0) is wrapped in double quotes, the
    quoted terms are joined with ``%2B`` (a URL-encoded ``+``), and spaces
    inside multi-word tokens become ``+``.  Returns ``""`` when no token
    carries a tag from ``NOUN_TAGS``.
    """
    quoted_terms = [
        '"' + token[0] + '"' for token in tagged_tokens if token[1] in NOUN_TAGS
    ]
    # Joining avoids the trailing separator the manual "+=" loop had to chop off.
    return "%2B".join(quoted_terms).replace(" ", "+")


i = 0
# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the crawler wrote UTF-8 -- confirm.
with open(INPUT_PATH, "r", encoding="utf-8") as fi:
    # Iterate lazily: only MAX_RECORDS lines are needed, not the whole file.
    for line in fi:
        if not line.strip():
            # A blank line is not a JSON document; skip it rather than crash.
            continue

        record = json.loads(line)
        t_str = ner(record["title"])

        # Fixed-position slices; presumably the date starts with
        # "YYYY-MM-DD" (or another single-character separator) -- TODO confirm.
        t_date = record["date"]
        year = t_date[0:4]
        month = t_date[5:7]
        day = t_date[8:10]

        print(t_str)
        searching_key = _build_searching_key(t_str)

        # Debug output, currently disabled:
        # print(searching_key)
        # print(year, ' ', month, ' ', day)
        # print(i)

        i += 1
        if i >= MAX_RECORDS:
            break