Commit 24e150af  Author: 薛凌堃

gdelt

Parent 30e0cd69
import time
import datetime
import json
import os
import zipfile

import requests
import pandas as pd
from goose3 import Goose
from kafka import KafkaProducer
from bs4 import BeautifulSoup
from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
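
# End-to-end flow: download the newest GDELT daily events archive, unzip it,
# read the article URLs from the exported CSV, extract title and body text
# with goose3, and push each record to the "crawlerInfo" Kafka topic.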

# Open each zip file at the given location with the zipfile module.
# Takes the list of file names and the directory they live in; the
# extraction target directory is read from the global save_path.
def Decompression(files, file_path):
    os.chdir(file_path)  # switch to the directory holding the archives
    for file_name in files:
        print(file_name)
        r = zipfile.is_zipfile(file_name)  # only extract valid zip archives
        if r:
            zpfd = zipfile.ZipFile(file_name)  # open the archive
            os.chdir(save_path)  # switch to the extraction target directory
            zpfd.extractall()
            zpfd.close()
            os.chdir(file_path)  # switch back; without this only the first archive gets extracted

def files_save():
    for file_path, sub_dirs, files in os.walk(open_path):  # collect every file name and path under open_path
        print(file_path, sub_dirs, files)
        Decompression(files, file_path)

@retry(tries=3, delay=1)
def downloadfile():
    # Download the latest daily events zip file from the GDELT index page.
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    service = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    browser = webdriver.Chrome(service=service, options=chrome_options)
    url = 'http://data.gdeltproject.org/events/index.html'
    browser.get(url)  # open the file index page
    page_source = browser.page_source  # grab the rendered HTML
    time.sleep(1)
    browser.quit()
    try:
        soup = BeautifulSoup(page_source, 'html.parser')
        list_all = soup.find('ul').find_all('li')[3]  # fourth entry in the file list
        href = list_all.find('a').text  # the link text is the archive file name
        header = {  # browser-style request headers
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'data.gdeltproject.org',
            'If-Modified-Since': 'Fri, 19 Jan 2024 07:00:22 GMT',
            'If-None-Match': '"4474b218588077d29447b0c6f0a5a498"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        one_url = "http://data.gdeltproject.org/events/" + href
        resp_content = requests.get(url=one_url, headers=header).content
    except:
        raise
    with open(f'D:/gdelt/file_zip/{href}', 'wb') as f:
        f.write(resp_content)
    print(f"{href}: download complete")
    return href

def spider_work(href):
    # Crawl the article pages listed in the extracted CSV.
    dete = href[:8]  # the archive name starts with the date, e.g. 20240118
    df_all = pd.read_csv(r'D:\gdelt\file_csv\{}.export.CSV'.format(dete), delimiter="\t", header=None)
    # deduplicate on the last column, which holds the source URL
    df_url = df_all.drop_duplicates(subset=[df_all.columns[-1]], keep='first', inplace=False)
    # start crawling
    goose = Goose()
    for news_url, news_date in zip(df_url[df_url.columns[-1]], df_url[df_url.columns[-2]]):
        time_tt = datetime.datetime.strptime(str(news_date), '%Y%m%d')
        news_time = time_tt.strftime("%Y-%m-%d")
        try:
            article = goose.extract(url=news_url)
            news_title = article.infos['title']
            news_content = article.infos['cleaned_text']
            aa_dict = {
                'author': '',
                'authorRaw': '',
                'content': news_content,
                'contentRaw': '',
                'contentWithTag': '',
                'contentWithTagRaw': '',
                'createDate': '',
                'id': '',
                'sid': '1603650832176173058',
                'lang': '',
                'langRaw': '',
                'origin': '',
                'originRaw': '',
                'publishDate': news_time,
                'sourceAddress': news_url,
                'summary': '',
                'summaryRaw': '',
                'keyWords': '',
                'title': news_title,
                'titleRaw': '',
                'source': '',
                'type': '26',
                'labels': [],
            }
            try:
                # push the record to the crawlerInfo Kafka topic
                producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
                kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
                print(kafka_result.get(timeout=10))
                # print(aa_dict)
            except Exception:
                pass
        except Exception:
            pass
        time.sleep(1)

if __name__ == "__main__":
    # download the archive
    href = downloadfile()
    # unzip it
    open_path = r'D:\gdelt\file_zip'  # directory holding the downloaded zip files
    save_path = r'D:\gdelt\file_csv'  # directory the CSVs are extracted into
    os.chdir(open_path)  # switch to the source directory
    files_save()
    # crawl the data
    # href = '20240118.export.CSV'
    spider_work(href)