crawl_from_Baomoi.py 1.6 KB

#!/usr/bin/python
# -*- coding: utf8 -*-
import sys
import time
# from sys import platform
from bs4 import BeautifulSoup
import requests
import json

start = time.time()
fi = open("links2.txt", "r")
fo = open("", "a")  # output path was left blank in the original; supply one before running
news = {}
# news[len(news)] = {'title': title, 'link': link, 'content': title + ' ' + description, 'category': category, 'page': page}
i = 0
for line in fi.readlines():
    i += 1
    if i < 69878:  # resume point: skip links that were already crawled
        continue
    link = line.strip()
    try:
        url = requests.get(link)
        if url.history:  # the request was redirected, so skip this link
            continue
        t_soup = BeautifulSoup(url.text, 'lxml')
        t_content = ""
        for title in t_soup.findAll('h1', {'class': 'bm_J'}):
            t_title = title.text
        for description in t_soup.findAll('h3', {'class': 'bm_Ak bm_J'}):
            t_description = description.text
        for date in t_soup.findAll('time'):
            if date.has_attr('datetime'):
                t_date = date['datetime']
        for category in t_soup.findAll('a', {'class': 'bm_y'}):
            t_category = category.text
        for content in t_soup.findAll('p', {'class': 'bm_Y'}):
            t_content += content.text + " "
        for content in t_soup.findAll('p', {'class': 'bm_Y bm_FP'}):
            t_content += content.text + " "
        news = {'title': t_title, 'description': t_description, 'content': t_content,
                'category': t_category, 'date': t_date}
        fo.write(json.dumps(news, ensure_ascii=False))
        fo.write('\n')
        print(i)
    except Exception:
        print("Error!")
fi.close()
fo.close()
print("--- %s seconds ---" % (time.time() - start))
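
A minimal sketch of reading the crawler's output back, assuming the blank output path above is filled in with a JSON-lines file; the name "output.jsonl" is only a placeholder and does not come from the original script. Each line of that file holds one article record:

import json

with open("output.jsonl", "r", encoding="utf-8") as f:  # placeholder path, not in the original
    for line in f:
        record = json.loads(line)
        print(record["title"], record["category"], record["date"])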