Commit 24e150af  Author: 薛凌堃

gdelt

Parent 30e0cd69
import time
import datetime
import json
import os
import zipfile

import requests
import pandas as pd
from goose3 import Goose
from kafka import KafkaProducer
from bs4 import BeautifulSoup
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
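
# End-to-end flow: download the newest GDELT daily events archive, unzip it,
# read the article URLs from the exported CSV, extract title and body text
# with goose3, and push each record to the "crawlerInfo" Kafka topic.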

# Open each zip file at the given location with the zipfile module.
# Takes the list of file names and the directory they live in; the
# extraction target directory is read from the global save_path.
def Decompression(files, file_path):
    os.chdir(file_path)  # switch to the directory holding the archives
    for file_name in files:
        print(file_name)
        r = zipfile.is_zipfile(file_name)  # only extract valid zip archives
        if r:
            zpfd = zipfile.ZipFile(file_name)  # open the archive
            os.chdir(save_path)  # switch to the extraction target directory
            zpfd.extractall()
            zpfd.close()
            os.chdir(file_path)  # switch back; without this only the first archive gets extracted

def files_save():
    for file_path, sub_dirs, files in os.walk(open_path):  # collect every file name and path under open_path
        print(file_path, sub_dirs, files)
        Decompression(files, file_path)

@retry(tries=3, delay=1)
def downloadfile():
    # Download the latest daily events zip file from the GDELT index page.
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    service = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    browser = webdriver.Chrome(service=service, options=chrome_options)
    url = 'http://data.gdeltproject.org/events/index.html'
    browser.get(url)  # open the file index page
    page_source = browser.page_source  # grab the rendered HTML
    time.sleep(1)
    browser.quit()
    try:
        soup = BeautifulSoup(page_source, 'html.parser')
        list_all = soup.find('ul').find_all('li')[3]  # fourth entry in the file list
        href = list_all.find('a').text  # the link text is the archive file name
        header = {  # browser-style request headers
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'data.gdeltproject.org',
            'If-Modified-Since': 'Fri, 19 Jan 2024 07:00:22 GMT',
            'If-None-Match': '"4474b218588077d29447b0c6f0a5a498"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        one_url = "http://data.gdeltproject.org/events/" + href
        resp_content = requests.get(url=one_url, headers=header).content
    except:
        raise
    with open(f'D:/gdelt/file_zip/{href}', 'wb') as f:
        f.write(resp_content)
    print(f"{href}: download complete")
    return href

def spider_work(href):
    # Crawl the article pages listed in the extracted CSV.
    dete = href[:8]  # the archive name starts with the date, e.g. 20240118
    df_all = pd.read_csv(r'D:\gdelt\file_csv\{}.export.CSV'.format(dete), delimiter="\t", header=None)
    # deduplicate on the last column, which holds the source URL
    df_url = df_all.drop_duplicates(subset=[df_all.columns[-1]], keep='first', inplace=False)
    # start crawling
    goose = Goose()
    for news_url, news_date in zip(df_url[df_url.columns[-1]], df_url[df_url.columns[-2]]):
        time_tt = datetime.datetime.strptime(str(news_date), '%Y%m%d')
        news_time = time_tt.strftime("%Y-%m-%d")
        try:
            article = goose.extract(url=news_url)
            news_title = article.infos['title']
            news_content = article.infos['cleaned_text']
            aa_dict = {
                'author': '',
                'authorRaw': '',
                'content': news_content,
                'contentRaw': '',
                'contentWithTag': '',
                'contentWithTagRaw': '',
                'createDate': '',
                'id': '',
                'sid': '1603650832176173058',
                'lang': '',
                'langRaw': '',
                'origin': '',
                'originRaw': '',
                'publishDate': news_time,
                'sourceAddress': news_url,
                'summary': '',
                'summaryRaw': '',
                'keyWords': '',
                'title': news_title,
                'titleRaw': '',
                'source': '',
                'type': '26',
                'labels': [],
            }
            try:
                # push the record to the crawlerInfo Kafka topic
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
                # print(aa_dict)
            except Exception:
                pass
        except Exception:
            pass
        time.sleep(1)

if __name__ == "__main__":
    # download the archive
    href = downloadfile()
    # unzip it
    open_path = r'D:\gdelt\file_zip'  # directory holding the downloaded zip files
    save_path = r'D:\gdelt\file_csv'  # directory the CSVs are extracted into
    os.chdir(open_path)  # switch to the source directory
    files_save()
    # crawl the data
    # href = '20240118.export.CSV'
    spider_work(href)