Commit ea2b7efb  Author: 丁双波

Merge remote-tracking branch 'origin/master'

......@@ -678,9 +678,19 @@ class BaseCore:
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self,social_code,cik):
try:
sql = "UPDATE EnterpriseInfo SET CIK = %s WHERE SocialCode = %s"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, (cik, social_code))
cnn.commit()
cursor.close()
cnn.close()
except Exception as e:
log = self.getLogger()
log.info(f'======保存企业CIK失败:{e}=====')
......
......@@ -116,33 +116,6 @@ def NoticeEnterprise_task():
print('定时采集异常', e)
pass
#企业年报
def AnnualEnterprise():
cnx,cursor = connectSql()
# 获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
for item in gn_social_list:
r.rpush('AnnualEnterprise:gnqy_socialCode', item)
closeSql(cnx,cursor)
#企业年报定时任务
def AnnualEnterprise_task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 测试时每10秒执行一次(上线后应改为每年执行一次)
scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
try:
# 定时开始前执行一次
AnnualEnterprise()
scheduler.start()
except Exception as e:
print('定时采集异常', e)
pass
#企业基本信息
def BaseInfoEnterprise():
cnx,cursor = connectSql()
......@@ -245,6 +218,33 @@ def weixin_task():
print('定时采集异常', e)
pass
#企业年报证监会
def AnnualEnterprise():
cnx,cursor = connectSql()
# 获取国内企业
gn_query = "select SocialCode from EnterpriseInfo where Place = '1' and SecuritiesCode is not null"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
for item in gn_social_list:
r.rpush('AnnualEnterprise:gnqy_socialCode', item)
closeSql(cnx,cursor)
#企业年报定时任务
def AnnualEnterprise_task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 测试时每10秒执行一次(上线后应改为每年执行一次)
scheduler.add_job(AnnualEnterprise, 'cron', second='*/10')
try:
# 定时开始前执行一次
AnnualEnterprise()
scheduler.start()
except Exception as e:
print('定时采集异常', e)
pass
# 企业年报——雪球网
def AnnualEnterpriseXueQ():
cnx,cursor = connectSql()
......@@ -271,6 +271,21 @@ def AnnualEnterpriseXueQ_task():
print('定时采集异常', e)
pass
#企业年报--美国证券交易委员会
def AnnualEnterpriseUS():
cnx,cursor = connectSql()
# 获取美股企业
us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode is not null"
# us_query = "select SocialCode from EnterpriseInfo where Place = '2' and SecuritiesType = '美股' and SecuritiesCode = 'BP' "
#ZZSN22080900000025
cursor.execute(us_query)
us_result = cursor.fetchall()
us_social_list = [item[0] for item in us_result]
print('=======')
for item in us_social_list:
r.rpush('AnnualEnterprise:usqy_socialCode', item)
closeSql(cnx,cursor)
#国外企业基本信息 redis中放入id
def BaseInfoEnterpriseAbroad():
cnx,cursor = connectSql()
......@@ -327,8 +342,8 @@ def FBS():
#省属国有企业 盟市国有企业
def MengZhi():
cnx, cursor = connectSql()
gn_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where b.type=5 and a.SocialCode=b.SocialCode;"
cnx, cursor = cnn11()
gn_query = "select * from t_0906 a where not exists (select 1 from sys_base_enterprise_executive b where a.xydm =b.social_credit_code)"
# gn_query = "select a.SocialCode from EnterpriseInfo a,EnterpriseType b where b.type=4 and a.SocialCode=b.SocialCode;"
cursor.execute(gn_query)
gn_result = cursor.fetchall()
......@@ -336,7 +351,7 @@ def MengZhi():
gn_social_list = [item[0] for item in gn_result]
for item in gn_social_list:
if not r.exists(item):
r.rpush('BaseInfoEnterpriseMz:gnqy_socialCode', item)
# r.rpush('BaseInfoEnterpriseMz:gnqy_socialCode', item)
r.rpush('CorPersonEnterprise:gnqy_socialCode', item)
closeSql(cnx, cursor)
......@@ -383,7 +398,8 @@ if __name__ == "__main__":
# NewsEnterprise()
# BaseInfoEnterprise()
# FBS()
MengZhi()
# MengZhi()
AnnualEnterpriseUS()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# NoticeEnterprise()
......
import json
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
url = 'http://www.imlooker.com/v4/company/142660.html'
req = requests.get(url)
soup = BeautifulSoup(req.content,'html.parser')
# print(soup)
info = soup.find('div',id='mydiv1')
company_name = '台玻集团'
WebSite =info.find('div').text.split('官网:')[1]
#简介
info.find('div').decompose()
briefInfo = info.text.strip()
table_info = soup.find_all('div',class_='com_340')[1]
# print(table_info)
td_list = table_info.find_all('td')
# print(td_list)
incorporationDate = td_list[1].text
businessRange = td_list[3].text
scale = td_list[11].text
address = td_list[13].text
aa_dict = {
'qccId': '', # 企查查企业id
'name': company_name, # 企业名称
'shortName': '', # 企业简称
'socialCreditCode': '', # 统一社会信用代码
'legalPerson': '', # 法定代表人
'officialPhone': '', # 电话
'officialUrl': WebSite, # 官网
'officialEmail': '', # 邮箱
'briefInfo': briefInfo, # 简介
'registerStatus': '', # 登记状态
'incorporationDate': incorporationDate, # 成立日期
'capital': '', # 注册资本
'paidCapital': '', # 实缴资本
'approvalDate': '', # 核准日期
'organizationCode': '', # 组织机构代码
'registerNo': '', # 工商注册号
'taxpayerNo': '', # 纳税人识别号
'type': '', # 企业类型
'businessStartDate': '', # 营业期限自
'businessEndDate': '', # 营业期限至
'taxpayerQualification': '', # 纳税人资质
'industry': '', # 所属行业
'region': '',
'province': '台湾省', # 所属省
'city': '台北市', # 所属市
'county': '松山区', # 所属县
'registerDepartment': '', # 登记机关
'scale': scale, # 人员规模
'insured': '', # 参保人数
'beforeName': '', # 曾用名
'englishName': 'Taiwan Glass Group', # 英文名
'importExportEnterpriseCode': '', # 进出口企业代码
'address': address, # 地址
'businessRange': businessRange, # 经营范围
'status': 0, # 状态
}
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("regionInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
"""
知网论文采集 模拟点击 封ip
"""
import pymysql
import requests,re,time,random
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from base.BaseCore import BaseCore
baseCore = BaseCore()
cnx = baseCore.cnx
cursor = baseCore.cursor
def get_proxy():
sql = "select proxy from clb_proxy"
cursor.execute(sql)
proxy_lists = cursor.fetchall()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
proxy_list.append(proxy)
return proxy_list
# 设置浏览器启动参数:从代理池中取第一个代理,注入到 Chrome capabilities
capabilities = webdriver.DesiredCapabilities.CHROME.copy()
proxy_meta = get_proxy()[0]['HTTP'].replace('http://', '')
Proxy({'proxyType': ProxyType.MANUAL, 'httpProxy': proxy_meta, 'sslProxy': proxy_meta}).add_to_capabilities(capabilities)
info = pd.read_excel('全球创新指标数据(1).xlsx')
enterprise_name_list = []
industry_list = []
for i in range(info.shape[0]):
# print(info['contrast_name'][i])
if info['contrast_name'][i]=='发表论文数量' :
enterprise_name = info['enterprise_name'][i]
if enterprise_name == '中国石油天然气股份有限公司':
pass
else:
continue
industry = info['industry'][i]
industry_list.append(industry)
enterprise_name_list.append(enterprise_name)
df_all = pd.DataFrame({'公司名称':enterprise_name_list,
'行业':industry_list})
df_all['文章发表数'] = ''
# for year in range(2022,1989,-1):
# df_all[f'{year}'] = ''
# print(df_all)
list_one_info = []
def get_num(com_name,com_industry):
url = f'https://kns.cnki.net/kns8/DefaultResult/Index?dbcode=CFLQ&kw={com_name}&korder=AF'
browser.get(url) # 跳到指定页面
time.sleep(2)
btn = browser.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/div/a/span')
btn.click()
print('点击1成功')
time.sleep(3)
btn2 = browser.find_element(By.XPATH,'//*[@id="divGroup"]/dl[3]/dt')
btn2.click()
print("点击2成功")
time.sleep(1)
page_source = browser.page_source # 获取页面信息
soup = BeautifulSoup(page_source, 'html.parser')
num_all = soup.find_all('div', {'class': 'resultlist'})[3].find('ul').find_all('li')
if num_all:
for li in num_all:
year = li.find('a').text
num = li.find('span').text.split('(')[1].split(')')[0]
dic_json = {
'enterprise_name':com_name,
'year':year,
'num':num,
'source':'国内外企业发布文章数量来源:中国知网',
'industry':com_industry
}
list_one_info.append(dic_json)
else:
dic_json = {
'enterprise_name': com_name,
'year': '',
'num': '',
'source': '国内外企业发布文章数量来源:中国知网',
'industry': com_industry
}
list_one_info.append(dic_json)
return list_one_info
chromedriver = r'D:\Chrome\chromedriver.exe'
browser = webdriver.Chrome(chromedriver)
for i in range(0,len(df_all)):
com_name = df_all['公司名称'][i]
com_industry=df_all['行业'][i]
try:
list_one_info = get_num(com_name,com_industry)
except:
continue
print(list_one_info)
df_info = pd.DataFrame(list_one_info)
df_info.to_excel('年份-论文发表数量.xlsx',index=False)
import requests, re, time, pymysql
+++ /dev/null
import requests, re, time, pymysql
from bs4 import BeautifulSoup as bs
from fdfs_client.client import get_tracker_conf, Fdfs_client
from base import BaseCore
baseCore = BaseCore.BaseCore()
requests.adapters.DEFAULT_RETRIES = 3
log = baseCore.getLogger()
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
cursor = cnx.cursor()
tracker_conf = get_tracker_conf('./client.conf')
client = Fdfs_client(tracker_conf)
taskType = '企业年报/雪球网'
def tableUpdate(year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by, create_time):
sel_sql = '''select item_id from clb_sys_attachment where item_id = %s and year = %s'''
cursor.execute(sel_sql, (item_id, year))
selects = cursor.fetchone()
if selects:
print(f'{com_name},{year}已存在')
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
create_by,
create_time)
cursor.execute(Upsql, values) # 插入
cnx.commit() # 提交
print("更新完成:{}".format(Upsql))
def getContent(social_code, com_name, code,start_time):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
}
comp = re.compile(r'-?[1-9]\d*')
num = 1
ip = baseCore.get_proxy()
url_1 = f'https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/{code}/page_type/ndbg.phtml'
res_1 = requests.get(url_1, proxies=ip)
soup = bs(res_1.content, 'html.parser',from_encoding='gb2312')
# 获取年度报告列表
try:
list_all = soup.find('div', {'class': 'datelist'}).find_all('a')
except:
log.info(f'{social_code}.........年度报告列表为空')
exception = '年度报告列表为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
return
# 获取年报详细信息
for href in list_all:
ip = baseCore.get_proxy()
year_url = 'https://vip.stock.finance.sina.com.cn' + href.get('href')
year_name = href.text
res_2 = requests.get(year_url, proxies=ip)
soup_2 = bs(res_2.content, 'html.parser',from_encoding='gb2312')
try:
pdf_url = soup_2.find('th', {'style': 'text-align:center'}).find('a').get('href')
except:
log.error(f'{social_code}....{year_url}....无下载链接')
exception = '无下载链接'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
for i in range(0, 3):
try:
resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
break
except:
time.sleep(3)
continue
try:
year = comp.findall(year_name)[0]
except:
continue
name_pdf = f"{com_name}:{year}年年报.pdf".replace('*', '')
result = ''
for i in range(0, 3):
try:
result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
break
except Exception as e:
log.error(f'{social_code}...年报上传服务器出错:{e}')
time.sleep(3)
continue
if result == '':
exception = '上传服务器失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
type_id = '1'
item_id = social_code
group_name = 'group1'
path = bytes.decode(result['Remote file_id']).replace('group1', '')
full_path = bytes.decode(result['Remote file_id'])
category = 'pdf'
file_size = result['Uploaded size']
order_by = num
status = 1
create_by = 'XueLingKun'
create_time = time_now
try:
tableUpdate(year, name_pdf, type_id, item_id, group_name, path, full_path, category, file_size,
order_by, status, create_by, create_time)
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, '')
except:
exception = '数据库传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, year_url, exception)
def begin():
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '91100000100003962T'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
code = dic_info[3]
com_name = dic_info[4]
if code is None:
exception = '股票代码为空'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
continue
getContent(social_code, com_name, code,start_time)
count += 1
runType = 'AnnualReportCount'
baseCore.updateRun(social_code, runType, count)
break
if __name__ == '__main__':
begin()
cursor.close()
cnx.close()
baseCore.close()
# -*- coding: utf-8 -*-
......@@ -97,10 +97,10 @@ def spider_annual_report(dict_info,num):
try:
# 标题中有年份,
year = re.findall(r'\d{4}', year_name)[0]
year = re.findall(r'\d{4}\s*年', year_name)[0]
if com_name != 'null':
name_pdf = f"{com_name}:{year}年报.pdf".replace('*', '')
name_pdf = f"{com_name}:{year}年报.pdf".replace('*', '')
else:
name_pdf = pdf_name_a + '.pdf'
except:
......@@ -144,6 +144,10 @@ def spider_annual_report(dict_info,num):
# name_pdf = pdf_name_a + '.pdf'
with cnx.cursor() as cursor:
if '年' in year:
year = year.split('年')[0]
else:
pass
sel_sql = '''select item_id,year from clb_sys_attachment where item_id = %s and year = %s and type_id="1" '''
cursor.execute(sel_sql, (social_code, int(year)))
selects = cursor.fetchone()
......@@ -208,7 +212,7 @@ def spider_annual_report(dict_info,num):
'code': '200',
}
print(dic_result)
return True
# return True
except Exception as e:
dic_result = {
'success': 'false',
......@@ -225,6 +229,8 @@ def spider_annual_report(dict_info,num):
# num = num + 1
time.sleep(2)
# browser.quit()
return True
#state1
if __name__ == '__main__':
......@@ -233,8 +239,8 @@ if __name__ == '__main__':
while True:
start_time = time.time()
# 获取企业信息
social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
# social_code = '911100007109288314'
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code = '9133060072360502XQ'
if not social_code:
time.sleep(20)
continue
......@@ -245,7 +251,7 @@ if __name__ == '__main__':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
count = dic_info[15]
count = dic_info[16]
code = dic_info[3]
com_name = dic_info[4]
if code is None:
......
......@@ -15,6 +15,23 @@ def conn11():
return conn,cursor
#企业公告
def shizhiCodeFromSql():
conn,cursor=conn11()
try:
gn_query = "select securities_code from sys_base_enterprise_ipo where category in ('4','5','6') "
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('shizhiCodeFromSql开始将股票代码放入redis=======')
for item in gn_social_list:
r.rpush('NoticeEnterprise:shizhi_code', item)
print('shizhiCodeFromSql将股票代码放入redis结束')
except Exception as e:
log.info("数据查询异常")
finally:
cursor.close()
conn.close()
#企业公告
def yahooCodeFromSql():
conn,cursor=conn11()
try:
......@@ -22,10 +39,10 @@ def yahooCodeFromSql():
cursor.execute(gn_query)
gn_result = cursor.fetchall()
gn_social_list = [item[0] for item in gn_result]
print('=======')
print('yahooCodeFromSql开始将股票代码放入redis=======')
for item in gn_social_list:
r.rpush('NoticeEnterprise:securities_code', item)
print('将股票代码放入redis结束')
print('yahooCodeFromSql将股票代码放入redis结束')
except Exception as e:
log.info("数据查询异常")
finally:
......@@ -38,28 +55,23 @@ def yahooCode_task():
# 每天执行一次
# scheduler.add_job(yahooCodeFromSql, 'cron', hour=0,minute=0)
#3天执行一次
scheduler.add_job(yahooCodeFromSql, 'interval', days=3)
scheduler.add_job(yahooCodeFromSql, 'cron', day='*/3', hour=0, minute=0)
# 每天执行一次
scheduler.add_job(shizhiCodeFromSql, 'cron', hour=10,minute=0)
try:
yahooCodeFromSql() # 定时开始前执行一次
shizhiCodeFromSql() # 定时开始前执行一次
scheduler.start()
except Exception as e:
print('定时采集异常', e)
pass
if __name__ == "__main__":
start = time.time()
# NoticeEnterprise()
# AnnualEnterpriseIPO()
# AnnualEnterprise()
# BaseInfoEnterpriseAbroad()
# NewsEnterprise_task()
# NewsEnterprise()
# BaseInfoEnterprise()
# FBS()
# NoticeEnterprise_task()
# AnnualEnterprise_task()
# NoticeEnterprise()
yahooCode_task()
# yahooShizhiCode_task()
log.info(f'====={basecore.getNowTime(1)}=====添加数据成功======耗时:{basecore.getTimeCost(start,time.time())}===')
# cnx.close()
# cursor.close()
......
# -*- coding: utf-8 -*-
......@@ -365,7 +365,10 @@ class YahooCaiwu(object):
currency = pq(resp1_table[0]).text()
if 'Currency in' in currency:
result = re.findall(r'(?<=Currency in\s).*', currency)
currency=result[0]+'(千)'
currency=result[0]
if '(' in currency:
currency=currency.split('(')[0]
currency=str(currency).upper()+'(千)'
except Exception as e:
currency=''
return currency
......
......@@ -115,69 +115,155 @@ class CurrencyRate(object):
if __name__ == '__main__':
result_list1 = [
[
'人民币',
'CNY'],
[
'美元',
'USD'],
[
'欧元',
'EUR'],
[
'瑞士法郎',
'CHF'],
[
'加元',
'CAD'],
[
'波兰兹罗提',
'PLN'],
[
'英镑',
'GBP'],
[
'澳元',
'AUD'],
[
'泰铢',
'THB'],
[
'沙特里亚尔',
'SAR'],
[
'巴西里亚伊',
'BRL'],
[
'新土耳其新里拉',
'TRY'],
[
'新台币',
'TWD'],
[
'印度卢比',
'INR'],
[
'墨西哥比索',
'MXN'],
[
'日元',
'JPY'],
[
'瑞典克朗',
'SEK'],
[
'韩元',
'KRW'],
[
'俄罗斯卢布',
'RUB'],
[
'新加坡元',
'SGD'],
[
'港币',
'HKD']]
['人民币','CNY'],
['港元','HKD'],
['台币','TWD'],
['欧元','EUR'],
['美元','USD'],
['英镑','GBP'],
['澳元','AUD'],
['韩元','KRW'],
['日元','JPY'],
['澳元','AUD'],
['阿尔巴尼亚列克','ALL'],
['阿尔及利亚第纳尔','DZD'],
['阿根廷比索','ARS'],
['阿鲁巴岛弗罗林','AWG'],
['澳元','AUD'],
['埃及镑','EGP'],
['埃塞俄比亚比尔','ETB'],
['澳门元','MOP'],
['阿曼里亚尔','OMR'],
['阿联酋迪拉姆','AED'],
['巴哈马元','BSD'],
['巴林第纳尔','BHD'],
['巴巴多斯元','BBD'],
['白俄罗斯卢布','BYR'],
['伯利兹元','BZD'],
['百慕大元','BMD'],
['不丹卢比','BTN'],
['玻利维亚诺','BOB'],
['博茨瓦纳普拉','BWP'],
['巴西里亚伊','BRL'],
['保加利亚列瓦','BGN'],
['布隆迪法郎','BIF'],
['冰岛克朗','ISK'],
['巴基斯坦卢比','PKR'],
['巴拿马巴尔博亚','PAB'],
['巴布亚新几内亚基那','PGK'],
['巴拉圭瓜拉尼','PYG'],
['波兰兹罗提','PLN'],
['朝鲜圆','KPW'],
['多哥非洲共同体法郎','XOF'],
['丹麦克朗','DKK'],
['多米尼加比索','DOP'],
['俄罗斯卢布','RUB'],
['佛得角埃斯库多','CVE'],
['福克兰群岛镑','FKP'],
['斐济元','FJD'],
['菲律宾比索','PHP'],
['港元','HKD'],
['刚果中非共同体法郎','XAF'],
['哥伦比亚比索','COP'],
['哥斯达黎加科朗','CRC'],
['古巴比索','CUP'],
['格林纳达东加勒比元','XCD'],
['冈比亚达拉西','GMD'],
['圭亚那元','GYD'],
['韩元','KRW'],
['海地古德','HTG'],
['洪都拉斯伦皮拉','HNL'],
['哈萨克斯坦腾格','KZT'],
['柬埔寨利尔斯','KHR'],
['加拿大元','CAD'],
['捷克克朗','CZK'],
['吉布提法郎','DJF'],
['几内亚法郎','GNF'],
['科摩罗法郎','KMF'],
['克罗地亚库纳','HRK'],
['肯尼亚先令','KES'],
['科威特第纳尔','KWD'],
['卡塔尔利尔','QAR'],
['老挝基普','LAK'],
['拉脱维亚拉图','LVL'],
['黎巴嫩镑','LBP'],
['莱索托洛提','LSL'],
['利比里亚元','LRD'],
['利比亚第纳尔','LYD'],
['立陶宛里塔斯','LTL'],
['列斯荷兰盾','ANG'],
['罗马尼亚新列伊','RON'],
['卢旺达法郎','RWF'],
['美元','USD'],
['孟加拉塔卡','BDT'],
['马其顿第纳尔','MKD'],
['马拉维克瓦查','MWK'],
['马来西亚林吉特','MYR'],
['马尔代夫卢非亚','MVR'],
['毛里塔尼亚乌吉亚','MRO'],
['毛里求斯卢比','MUR'],
['墨西哥比索','MXN'],
['摩尔多瓦列伊','MDL'],
['蒙古图格里克','MNT'],
['摩洛哥道拉姆','MAD'],
['缅甸元','MMK'],
['秘鲁索尔','PEN'],
['纳米比亚元','NAD'],
['尼泊尔卢比','NPR'],
['尼加拉瓜科多巴','NIO'],
['尼日利亚奈拉','NGN'],
['挪威克朗','NOK'],
['南非兰特','ZAR'],
['欧元','EUR'],
['日元','JPY'],
['瑞典克朗','SEK'],
['瑞士法郎','CHF'],
['萨尔瓦多科朗','SVC'],
['萨摩亚塔拉','WST'],
['圣多美多布拉','STD'],
['沙特阿拉伯里亚尔','SAR'],
['塞舌尔法郎','SCR'],
['塞拉利昂利昂','SLL'],
['所罗门群岛元','SBD'],
['索马里先令','SOS'],
['斯里兰卡卢比','LKR'],
['圣赫勒拿群岛磅','SHP'],
['斯威士兰里兰吉尼','SZL'],
['台币','TWD'],
['土耳其新里拉','TRY'],
['太平洋法郎','XPF'],
['坦桑尼亚先令','TZS'],
['泰国铢','THB'],
['汤加潘加','TOP'],
['特立尼达和多巴哥元','TTD'],
['突尼斯第纳尔','TND'],
['文莱元','BND'],
['危地马拉格查尔','GTQ'],
['乌克兰赫夫米','UAH'],
['乌拉圭新比索','UYU'],
['瓦努阿图瓦图','VUV'],
['越南盾','VND'],
['匈牙利福林','HUF'],
['新西兰元','NZD'],
['新加坡元','SGD'],
['叙利亚镑','SYP'],
['英镑','GBP'],
['印度卢比','INR'],
['印度尼西亚卢比(盾)','IDR'],
['伊朗里亚尔','IRR'],
['伊拉克第纳尔','IQD'],
['以色列镑','ILS'],
['牙买加元','JMD'],
['约旦第纳尔','JOD'],
['也门里亚尔','YER'],
['智利比索','CLP'],
['直布罗陀镑','GIP'],
['铜价盎司','XCP'],
['金价盎司','XAU'],
['钯价盎司','XPD'],
['铂价盎司','XPT'],
['银价盎司','XAG']
]
result_list2 = [
'USD',
'CNY']
......
# -*- coding: utf-8 -*-
......@@ -50,11 +50,15 @@ class Shizhi(object):
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def getCodeFromRedis(self):
securitiescode=self.r.lpop('NoticeEnterprise:shizhi_code')
securitiescode = securitiescode.decode('utf-8')
return securitiescode
def getmarketCap(self):
def getmarketCap(self,securitiescode):
conn,cursor=self.conn11()
try:
sql1 = """select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where category in ('4','5','6') """ # and stock_code = "SYNH"
sql1 = f"select social_credit_code,securities_code,securities_short_name from sys_base_enterprise_ipo where securities_code='{securitiescode}' " # and stock_code = "SYNH"
cursor.execute(sql1)
result_data = cursor.fetchall()
except Exception as e:
......@@ -75,21 +79,25 @@ class Shizhi(object):
url = f'https://finance.yahoo.com/quote/{stock2}?p={stock2}'
try:
self.logger.info(f'正在采集:{url}')
# 设置页面加载超时时间为10秒
self.driver.set_page_load_timeout(60)
self.driver.get(url)
# 等待页面加载完成
wait = WebDriverWait(self.driver, 300)
wait = WebDriverWait(self.driver, 60)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(5)
doc_resp = pq(self.driver.page_source)
unit=doc_resp('div[id="quote-header-info"]>div:nth-child(2)>div:nth-child(1)>div:nth-child(2)>span')
currency = unit.text().split("Currency in ")[1]
if '(' in currency:
currency=currency.split('(')[0]
market_cap=doc_resp('td[data-test="MARKET_CAP-value"]')
marketcap=market_cap.text()
if marketcap and marketcap!='N/A':
# 获取当前时间
current_time = datetime.datetime.now()
currentdate = current_time.strftime("%Y-%m-%d")
print(f'信用代码:{social_credit_code} 股票代码:{stock} 币种:{currency} 市值:{marketcap} 日期:{currentdate}')
self.logger.info(f'信用代码:{social_credit_code} 股票代码:{stock} 币种:{currency} 市值:{marketcap} 日期:{currentdate}')
# market_url = f'http://192.168.1.39:8088/sync/marketValue'
market_url = f'http://114.115.236.206:8088/sync/marketValue'
param= {
......@@ -103,26 +111,33 @@ class Shizhi(object):
resp = requests.post(market_url,json=param)
# 检查响应状态码
if resp.status_code == 200:
print("请求成功")
self.logger.info("调用接口请求成功")
# 打印响应内容
print(resp.content)
self.logger.info(resp.content)
else:
print("请求失败")
self.logger.info("调用接口请求失败")
except:
with open('雅虎财经-财务数据_发送错误ID.txt', 'a', encoding='utf8')as f:
f.write(stock + '\n')
self.logger.info("调用接口请求失败")
except Exception as e:
self.driver.close()
self.logger.info('请求异常!重新打开浏览器')
self.driver.quit()
self.driver=self.get_webdriver()
print(e)
except Exception as e:
print(e)
self.driver.close()
self.driver.quit()
self.driver=self.get_webdriver()
self.logger.info(f'{securitiescode}股票的市值采集结束')
if __name__ == '__main__':
shizhi=Shizhi()
shizhi.getmarketCap()
\ No newline at end of file
shizhi=Shizhi()
# shizhi.getmarketCap()
while True:
securitiescode=''
try:
securitiescode=shizhi.getCodeFromRedis()
shizhi.getmarketCap(securitiescode)
except Exception as e:
shizhi.logger.info("redis为空等待5分钟")
if securitiescode:
shizhi.r.rpush('NoticeEnterprise:shizhi_code',securitiescode)
else:
time.sleep(300)
\ No newline at end of file
"""
"""
......@@ -458,83 +458,93 @@ def getReportTime():
def job(taskType):
# 将上市企业库中的全部A股代码存入list
# 需要提供股票代码、企业信用代码
while True:
#从redis中获取企业信用代码
social_code = baseCore.redicPullData('FinanceFromEast:eastfinance_socialCode')
# social_code = '91100000100003962T'
# 判断 如果Redis中已经没有数据,则等待
if social_code == None:
time.sleep(20)
continue
sql_sel = f'''select securities_code,exchange from sys_base_enterprise_ipo where category = '1' and social_credit_code='{social_code}' '''
cursor.execute(sql_sel)
row = cursor.fetchone()
try:
securities_code = row[0]
pass
except:
log.info(f'======{social_code}没有股票代码======')
continue
exchange = row[1]
# for code in list_code:
# social_code = rows[0]
# exchange = rows[2]
# if code==rows[1]:
# securities_code = code
# else:
# continue
if exchange == 1:
com_code = 'bj' + securities_code
if exchange == 2:
com_code = 'sh' + securities_code
if exchange == 3:
com_code = 'sz' + securities_code
# if com_code=='sz002163':
list_date = getReportTime()
delist = [] # 记录该企业所有无数据的报告期
date_list = [] # 记录该企业所有数据的报告期
start_time = time.time()
# 分别对每个报告期进行采集
for info_date in list_date:
delist_all = []
info_date_list = []
dic_info = get_info(social_code, com_code, info_date, delist_all, info_date_list,taskType)
# print(dic_info)
# 将采集后的报告期存入redis
if len(dic_info)!=0:
# 调凯歌接口存储数据
data = json.dumps(dic_info)
# print(data)
url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
for nnn in range(0, 3):
try:
res_baocun = requests.post(url_baocun, data=data)
break
except:
time.sleep(1)
print(res_baocun.text)
def job(taskType):
# 将上市企业库中的全部A股代码存入list
# 需要提供股票代码、企业信用代码
while True:
# 从redis中获取企业信用代码
social_code = baseCore.redicPullData('FinanceFromEast:finance_socialCode')
# social_code = '91420300178856869P'
# 判断 如果Redis中已经没有数据,则等待
log.info(f'==========正在采集{social_code}============')
if social_code == None:
time.sleep(20)
continue
for nnn in range(0, 3):
try:
add_date(com_code, date_list)
break
except:
time.sleep(1)
if len(info_date_list) != 0:
for date in info_date_list:
date_list.append(date)
print(date_list)
# date_list = str(date_list)
end_time = time.time()
log.info(f'===={com_code}====该企业耗时{end_time-start_time}===')
sql_sel = f'''select securities_code,exchange from sys_base_enterprise_ipo where category = '1' and social_credit_code='{social_code}' '''
cursor.execute(sql_sel)
row = cursor.fetchone()
try:
securities_code = row[0]
pass
except:
log.info(f'======{social_code}没有股票代码======')
continue
exchange = row[1]
# for code in list_code:
# social_code = rows[0]
# exchange = rows[2]
# if code==rows[1]:
# securities_code = code
# else:
# continue
if exchange == 1:
com_code = 'bj' + securities_code
if exchange == 2:
com_code = 'sh' + securities_code
if exchange == 3:
com_code = 'sz' + securities_code
# if com_code=='sz002163':
list_date = getReportTime()
delist = [] # 记录该企业所有无数据的报告期
date_list = [] # 记录该企业所有数据的报告期
start_time = time.time()
# 分别对每个报告期进行采集
for info_date in list_date:
delist_all = []
info_date_list = []
dic_info = get_info(social_code, com_code, info_date, delist_all, info_date_list, taskType)
# print(dic_info)
# 将采集后的报告期存入redis
if len(info_date_list) != 0:
for date in info_date_list:
date_list.append(date)
if len(dic_info) != 0:
# 调凯歌接口存储数据
data = json.dumps(dic_info)
# print(data)
url_baocun = 'http://114.115.236.206:8088/sync/finance/df'
for nnn in range(0, 3):
try:
res_baocun = requests.post(url_baocun, data=data)
break
except:
time.sleep(1)
print(res_baocun.text)
for nnn in range(0, 3):
try:
add_date(com_code, date_list)
break
except:
time.sleep(1)
# if len(info_date_list) != 0:
# for date in info_date_list:
# date_list.append(date)
log.info(date_list)
# date_list = str(date_list)
end_time = time.time()
log.info(f'===={com_code}====该企业耗时{end_time - start_time}===')
cnx.close()
cursor.close()
baseCore.close()
cnx.close()
cursor.close()
baseCore.close()
if __name__=='__main__':
task_type = '财务数据/东方财富网/福布斯'
task_type = '财务数据/东方财富网'
job(task_type)
......
import datetime
import time
import redis
import requests
import urllib3
from pyquery import PyQuery as pq
import json
from kafka import KafkaProducer
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
#国务院政策问答平台最新发布信息采集
def reqHtml(url,data,header):
try:
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
json_data=json.dumps(data)
response = requests.post(url,data=json_data,headers=header,verify=False,timeout=10)
print(response.status_code)
html=response.text
except Exception as e:
html=''
return html
def page_list():
header = {
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'72',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/index/index',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh'
}
url='https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicyList'
for i in range(1,445):
print(f'采集第{i}页数据')
k=i
da='{"filterType":"","departmentid":"","keyword":"","page_size":15,"page":[k]}'
data=da.replace('[k]',str(k))
try:
data=json.loads(data)
lhtml=reqHtml(url,data,header)
hjson=json.loads(lhtml)
data=hjson['data']['list']
except Exception as e:
print(e)
time.sleep(60)
continue
for ss in data:
id=ss['id']
durl=f'https://xcx.www.gov.cn/ebus/gwymp/api/r/faqlib/GetPolicy'
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
try:
flag=r.sismember('IN-20230829-0146',sourceAddress)
if flag:
print('信息已采集入库过')
continue
except Exception as e:
continue
ss['url']=durl
ss['sourceAddress']=sourceAddress
detailpaser(ss)
# time.sleep(5)
def detailpaser(dmsg):
hh={
'Host':'xcx.www.gov.cn',
'Connection':'keep-alive',
'Content-Length':'14',
'x-tif-openid':'ojyj-41lGcemgsREMHBh1ac7iZUw',
'x-tif-did':'pb5XUGL1Zm',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Windows WindowsWechat/WMPF XWEB/8379',
'x-tif-sid':'e1436792814f1c6845af4d84cbc4ad9957',
'Content-Type':'application/json',
'xweb_xhr':'1',
'dgd-pre-release':'0',
'x-yss-page':'publicService/pages/policyQALibrary/detail/detail',
'x-yss-city-code':'4400',
'Accept':'*/*',
'Sec-Fetch-Site':'cross-site',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Dest':'empty',
'Referer':'https://servicewechat.com/wxbebb3cdd9b331046/713/page-frame.html',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh'
}
dhtml=''
try:
durl=dmsg['url']
id=str(dmsg['id'])
data={"id":id}
json_data=json.dumps(data)
response = requests.post(durl,data=json_data,headers=hh,verify=False,timeout=10)
dhtml=response.text
dd=json.loads(dhtml)
sendTokafka(dd)
except Exception as e:
print(e)
print(dhtml)
def sendTokafka(ddata):
dd=ddata['data']
title=dd['title']
id=dd['id']
content=dd['content']
contentWithTag=dd['content']
publishTime=dd['publishTime']
time_format='%Y年%m月%d日'
publishDate=str(datetime.datetime.strptime(publishTime, time_format))
origin=dd['departmentName']
sourceAddress=f'https://bmfw.www.gov.cn/zcdwpt/index.html#/detail?id={id}'
sid='1696404919115825153'
info_code='IN-20230829-0146'
aa_dict = {
'content': content,
'contentWithTag': contentWithTag,
'id': '',
'sid': sid,
'origin': origin,
'publishDate': publishDate,
'sourceAddress': sourceAddress,
'title': title,
'source': 'python定制采集',
'type': ''
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code,sourceAddress)
print('发送kafka成功!')
except Exception as e:
print(e)
finally:
producer.close()
# r.close()
if __name__ == '__main__':
r = redis.Redis(host='114.115.236.206', port=6379,password='clbzzsn', db=5)
page_list()
from datetime import datetime
from urllib.parse import urljoin
import redis
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
import json
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def reqHtml(url):
try:
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
header={
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'__jsluid_s=d344baee4a1e027b745a48855ff6539d',
'Host':'www.miit.gov.cn',
'Pragma':'no-cache',
'Referer':'https://www.miit.gov.cn/zwgk/zcjd/index.html',
'Sec-Fetch-Dest':'empty',
'Sec-Fetch-Mode':'cors',
'Sec-Fetch-Site':'same-origin',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
response = requests.get(url,headers=header,verify=False,timeout=10)
code=response.status_code
print(f'url:{url} 信息的采集状态码{code}')
html=response.text
except Exception as e:
html=''
return html
# 将html中的相对地址转换成绝对地址
def paserUrl(html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def page_list():
for i in range(1,27):
print(f"采集到第{i}页!!")
aurl='https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit?webId=8d828e408d90447786ddbe128d495e9e&pageId=1b56e5adc362428299dfc3eb444fe23a&parseType=buildstatic&pageType=column&tagId=右侧内容&tplSetId=209741b2109044b5b7695700b2bec37e&paramJson={"pageNo":[i],"pageSize":"24"}'
url=aurl.replace('[i]',str(i))
html=reqHtml(url)
text=json.loads(html)
html=text['data']['html']
soup=paserUrl(html,'https://www.miit.gov.cn/zwgk/zcjd/index.html')
html=str(soup.prettify())
doc=pq(html)
ll=doc('li[class="cf"]')
for list in ll:
ldoc=pq(list)
title=ldoc('a').text()
url=ldoc('a').attr('href')
# url='https://www.miit.gov.cn'+url
try:
flag=r.sismember('IN-20230829-0199',url)
if flag:
print(f'信息已采集入库{title}')
continue
except Exception as e:
continue
publishdate=ldoc('span').text()
dmsg={
'title':title,
'url':url,
'publishdate':publishdate
}
print(f'列表信息: title:{title} url:{url} time:{publishdate}')
detail(dmsg)
def detail(dmsg):
try:
durl=dmsg['url']
title=dmsg['title']
publishTime=dmsg['publishdate']
html=reqHtml(durl)
soup=paserUrl(html,durl)
con=soup.select('div[id="con_con"]')[0]
contentWithTag=con.prettify()
content=con.text
if content:
pass
else:
content=contentWithTag
ddata={
'title':title,
'publishTime':publishTime,
'sourceAddress':durl,
'content':content,
'contentWithTag':contentWithTag,
'origin':'中华人民共和国工业和信息化部-政务公开-政策解读',
}
sendTokafka(ddata)
except Exception as e:
print(e)
def sendTokafka(ddata):
title=ddata['title']
content=ddata['content']
contentWithTag=ddata['contentWithTag']
publishTime=ddata['publishTime']
sourceAddress=ddata['sourceAddress']
origin=ddata['origin']
time_format='%Y-%m-%d'
publishDate=str(datetime.strptime(publishTime, time_format))
sid='1696452056436424706'
info_code='IN-20230829-0199'
aa_dict = {
'content': content,
'contentWithTag': contentWithTag,
'id': '',
'sid': sid,
'origin': origin,
'publishDate': publishDate,
'sourceAddress': sourceAddress,
'title': title,
'source': 'python定制采集',
'type': ''
}
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo", json.dumps(aa_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code,sourceAddress)
print('发送kafka结束')
except Exception as e:
print(e)
print('发送kafka异常!')
finally:
producer.close()
# r.close()
if __name__ == '__main__':
r = redis.Redis(host='114.115.236.206', port=6379,password='clbzzsn', db=5)
page_list()
print('采集结束===')
\ No newline at end of file
Two websites are collected:
1. 中国政府网-最新发布 — the data is fetched from the WeChat mini-program API and matches what the website shows: http://bmfw.www.gov.cn/zcdwpt/index.html#/
2. 中华人民共和国工业和信息化部-政务公开-政策解读: https://www.miit.gov.cn/zwgk/zcjd/index.html
In both cases only the corresponding article/policy information is collected.
"""
"""
......@@ -135,7 +135,7 @@ def getUrl(code, url_parms, Catagory2_parms):
def InsterInto(short_name, social_code, name_pdf, pub_time, pdf_url, report_type):
inster = False
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s'''
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
cursor.execute(sel_sql, (social_code, pdf_url))
selects = cursor.fetchone()
if selects:
......
This source diff could not be displayed because it is too large.
......@@ -13,7 +13,7 @@ baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
headers = {
'Cookie':'HWWAFSESID=0e10b77869899be8365; HWWAFSESTIME=1688781923708; csrfToken=VeTF4UIZKJ0q6yWmgfC_FLqv; TYCID=e7cec7501d3311eea9dcb9fb7af79aad; ssuid=3142278034; sajssdk_2015_cross_new_user=1; bannerFlag=true; _ga=GA1.2.1006597844.1688781929; _gid=GA1.2.146077413.1688781929; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1688781929; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2217103123002%22}; tyc-user-info-save-time=1688781977329; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzEwMzEyMzAwMiIsImlhdCI6MTY4ODc4MTk3NiwiZXhwIjoxNjkxMzczOTc2fQ.Luw0DCFul8WxRNOM8X5-NCmy_z3BwJC5JBvofWqWkSQOleJ6zJU0SRbqwAobPfOfVyGFDUBqmxxWd4YKCeCWeQ; tyc-user-phone=%255B%252217103123002%2522%255D; searchSessionId=1688778331.16177575; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22302953956%22%2C%22first_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg5MzMzZjM4Y2I5NDctMGZiOWIyNTI3NDJhNmMtMjYwMzFkNTEtOTIxNjAwLTE4OTMzM2YzOGNkY2RkIiwiJGlkZW50aXR5X2xvZ2luX2lkIjoiMzAyOTUzOTU2In0%3D%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22302953956%22%7D%2C%22%24device_id%22%3A%22189333f38cb947-0fb9b252742a6c-26031d51-921600-189333f38cdcdd%22%7D; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1688781980',
'Cookie':'HWWAFSESID=b6312a4594bea18413c; HWWAFSESTIME=1686818921445; csrfToken=e7sNDKWelJwlcjnm6Rlny887; TYCID=6ff6bc600b5911ee89d35bf79a73a3b1; bannerFlag=true; ssuid=1534238432; refresh_page=0; _ga=GA1.2.1790752229.1688467828; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22307016917%22%2C%22first_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTg4YmUzZTMzN2U0YmYtMGQ4NTcxNmQzNjZlNDQtMjYwMzFkNTEtMTA0OTA4OC0xODhiZTNlMzM3ZjE5ZSIsIiRpZGVudGl0eV9sb2dpbl9pZCI6IjMwNzAxNjkxNyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22307016917%22%7D%2C%22%24device_id%22%3A%22188be3e337e4bf-0d85716d366e44-26031d51-1049088-188be3e337f19e%22%7D; jsid=SEO-BAIDU-ALL-SY-000001; bdHomeCount=7; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1693986307; tyc-user-info=%7B%22state%22%3A%220%22%2C%22vipManager%22%3A%220%22%2C%22mobile%22%3A%2213592481839%22%7D; tyc-user-info-save-time=1693986377592; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzU5MjQ4MTgzOSIsImlhdCI6MTY5Mzk4NjM3NywiZXhwIjoxNjk2NTc4Mzc3fQ.xeK54nMtB5wt7ipdOjhrzdplT1azvezrTuoD1b8i3OguqMB97ZOR1pFbRsP7vsKRdZ3Fsf5Y5ZqlmRKAVHGraA; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1693986412',
# 'Cookie': 'TYCID=82cbe530204b11ed9f23298cecec1c60; ssuid=3927938144; _ga=GA1.2.1842488970.1670638075; jsid=SEO-BAIDU-ALL-SY-000001; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2215565837784%22}; tyc-user-info-save-time=1678953978429; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTU2NTgzNzc4NCIsImlhdCI6MTY3ODk1Mzk3OCwiZXhwIjoxNjgxNTQ1OTc4fQ.wsNxLWMkZVrtOEvo_CCDPD38R7F23c5yk7dFAdHkwFPkZhEEvmiv0nlt7UD0ZWfo3t8aYxc4qvu4ueEgMubJ5g; tyc-user-phone=%255B%252215565837784%2522%255D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22284710084%22%2C%22first_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfbG9naW5faWQiOiIyODQ3MTAwODQiLCIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgyYjljYTU4NWVhZC0wODk1OThjMWQ3Zjc5MjgtMjYwMjFkNTEtMTMyNzEwNC0xODJiOWNhNTg1ZjdmMSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%22284710084%22%7D%2C%22%24device_id%22%3A%22182b9ca585ead-089598c1d7f7928-26021d51-1327104-182b9ca585f7f1%22%7D; HWWAFSESID=fa776898fa88a6520ea; HWWAFSESTIME=1679899464128; csrfToken=m3cB6mHsznwIuppkT-S8oYc6; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1679016180,1679471093,1679732923,1679899468; bdHomeCount=28; bannerFlag=true; show_activity_id_92=92; searchSessionId=1679899783.48494979; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1679899783',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
......@@ -27,7 +27,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91610000220568570K'
# social_code = '91110108778635402E'
if social_code == None:
time.sleep(20)
continue
......
......@@ -27,7 +27,7 @@ cursor= cnx.cursor()
#根据信用代码获取天眼查id 企业名字等信息
def getTycIdByXYDM(xydm):
retData={'state':False,'tycData':None}
retData={'state':False,'tycData':None,'reput':True}
url=f"https://capi.tianyancha.com/cloud-tempest/search/suggest/v3?_={baseCore.getNowTime(3)}"
ip = baseCore.get_proxy()
paramJsonData = {'keyword':xydm}
......@@ -37,19 +37,25 @@ def getTycIdByXYDM(xydm):
response = requests.post(url,json=paramJsonData,headers=headers,verify=False, proxies=ip)
time.sleep(random.randint(3, 5))
retJsonData =json.loads(response.content.decode('utf-8'))
if retJsonData['data'] and retJsonData['state']== 'ok':
pass
else:
log.error(f"---{xydm}-未查询到该企业---")
retData['reput'] = False
return retData
matchType=retJsonData['data'][0]['matchType']
if matchType=='信用代码匹配':
retData['state'] = True
retData['tycData'] = retJsonData['data'][0]
response.close()
return retData['tycData']
return retData
else:
log.error(f"{xydm}------{retJsonData}")
response.close()
return retData['tycData']
return retData
except:
log.error(f"---{xydm}--天眼查token失效---")
return retData['tycData']
return retData
# 更新天眼查企业基本信息
......
......@@ -282,19 +282,21 @@ def doJob():
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(xydm)
if retData:
if retData['tycData'] and retData['reput']:
tycid = retData['id']
# todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
elif not retData['tycData'] and retData['reput']:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('NewsEnterprise:gnqy_socialCode', social_code)
continue
elif not retData['reput'] and not retData['tycData']:
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
......
# -*- coding: utf-8 -*-
import datetime
import time
import pymysql
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from pyquery import PyQuery as pq
from openpyxl import Workbook
import pandas as pd
class WanfangSpider(object):
def __init__(self):
pass
def req(self,url):
header={
"accept":"*/*",
"connection":"Keep-Alive",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}
res = requests.get(url,headers=header)
if res.status_code==200:
text=res.text
print('请求成功!')
else:
text=''
print('请求失败!')
return text
# 将html中的相对地址转换成绝对地址
def paserUrl(self,html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
def pageList(self,start,end):
listmsg=[]
# for num in range(1,1321):
for num in range(start,end):
url=f'https://kms.wanfangdata.com.cn/IndustryYJ/Search/Cecdb?q=%E5%86%B6%E9%87%91%2B%E5%86%B6%E7%82%BC&PageNumber={num}'
html=self.req(url)
soup=self.paserUrl(html,url)
text=str(soup.prettify())
doc=pq(text)
liTag=doc('li[class="rt-wrap"]')
# print(liTag)
for li in liTag:
lidoc=pq(li)
title=lidoc('a[class="title"]').text()
turl=lidoc('a[class="title"]').attr('href')
msg={
'title':title,
'turl':turl
}
print(f'title:{title} url:{url}')
listmsg.append(msg)
return listmsg
def detailMsg(self,msg):
detailList=[]
turl = msg['turl']
title = msg['title']
html=self.req(turl)
soup=self.paserUrl(html,turl)
dtext=str(soup.prettify())
ddoc=pq(dtext)
a1=ddoc('table[class="detail-md"]>tr:nth-child(2)>td:nth-child(1)').text().replace(":","")
institutionType=ddoc('table[class="detail-md"]>tr:nth-child(2)>td:nth-child(2)').text()
a2=ddoc('table[class="detail-md"]>tr:nth-child(3)>td:nth-child(1)').text().replace(":","")
formerName=ddoc('table[class="detail-md"]>tr:nth-child(3)>td:nth-child(2)').text()
a3=ddoc('table[class="detail-md"]>tr:nth-child(4)>td:nth-child(1)').text().replace(":","")
leader=ddoc('table[class="detail-md"]>tr:nth-child(4)>td:nth-child(2)').text()
a4=ddoc('table[class="detail-md"]>tr:nth-child(5)>td:nth-child(1)').text().replace(":","")
establishmentDate=ddoc('table[class="detail-md"]>tr:nth-child(5)>td:nth-child(2)').text()
a5=ddoc('table[class="detail-md"]>tr:nth-child(6)>td:nth-child(1)').text().replace(":","")
introduction=ddoc('table[class="detail-md"]>tr:nth-child(6)>td:nth-child(2)').text()
a6=ddoc('table[class="detail-md"]>tr:nth-child(7)>td:nth-child(1)').text().replace(":","")
classification=ddoc('table[class="detail-md"]>tr:nth-child(7)>td:nth-child(2)').text()
a7=ddoc('table[class="detail-md"]>tr:nth-child(8)>td:nth-child(1)').text().replace(":","")
keywords=ddoc('table[class="detail-md"]>tr:nth-child(8)>td:nth-child(2)').text()
a8=ddoc('table[class="detail-md"]>tr:nth-child(9)>td:nth-child(1)').text().replace(":","")
researchEquipment=ddoc('table[class="detail-md"]>tr:nth-child(9)>td:nth-child(2)').text()
a9=ddoc('table[class="detail-md"]>tr:nth-child(10)>td:nth-child(1)').text().replace(":","")
researchAreas=ddoc('table[class="detail-md"]>tr:nth-child(10)>td:nth-child(2)').text()
a10=ddoc('table[class="detail-md"]>tr:nth-child(11)>td:nth-child(1)').text().replace(":","")
awards=ddoc('table[class="detail-md"]>tr:nth-child(11)>td:nth-child(2)').text()
a11=ddoc('table[class="detail-md"]>tr:nth-child(12)>td:nth-child(1)').text().replace(":","")
internalDepartments=ddoc('table[class="detail-md"]>tr:nth-child(12)>td:nth-child(2)').text()
a12=ddoc('table[class="detail-md"]>tr:nth-child(13)>td:nth-child(1)').text().replace(":","")
subsidiaryInstitutions=ddoc('table[class="detail-md"]>tr:nth-child(13)>td:nth-child(2)').text()
a13=ddoc('table[class="detail-md"]>tr:nth-child(14)>td:nth-child(1)').text().replace(":","")
productInformation=ddoc('table[class="detail-md"]>tr:nth-child(14)>td:nth-child(2)').text()
a14=ddoc('table[class="detail-md"]>tr:nth-child(15)>td:nth-child(1)').text().replace(":","")
publicationJournals=ddoc('table[class="detail-md"]>tr:nth-child(15)>td:nth-child(2)').text()
a15=ddoc('table[class="detail-md"]>tr:nth-child(16)>td:nth-child(1)').text().replace(":","")
mailingAddress=ddoc('table[class="detail-md"]>tr:nth-child(16)>td:nth-child(2)').text()
a16=ddoc('table[class="detail-md"]>tr:nth-child(17)>td:nth-child(1)').text().replace(":","")
tel=ddoc('table[class="detail-md"]>tr:nth-child(17)>td:nth-child(2)').text()
a17=ddoc('table[class="detail-md"]>tr:nth-child(18)>td:nth-child(1)').text().replace(":","")
faxNumber=ddoc('table[class="detail-md"]>tr:nth-child(18)>td:nth-child(2)').text()
a18=ddoc('table[class="detail-md"]>tr:nth-child(19)>td:nth-child(1)').text().replace(":","")
email=ddoc('table[class="detail-md"]>tr:nth-child(19)>td:nth-child(2)').text()
a19=ddoc('table[class="detail-md"]>tr:nth-child(20)>td:nth-child(1)').text().replace(":","")
website=ddoc('table[class="detail-md"]>tr:nth-child(20)>td:nth-child(2)').text()
a20=ddoc('table[class="detail-md"]>tr:nth-child(21)>td:nth-child(1)').text().replace(":","")
web=ddoc('table[class="detail-md"]>tr:nth-child(21)>td:nth-child(2)').text()
detailmsg={
'title':title,
'turl':turl,
a1:institutionType,
a2:formerName,
a3:leader,
a4:establishmentDate,
a5:introduction,
a6:classification,
a7:keywords,
a8:researchEquipment,
a9:researchAreas,
a10:awards,
a11:internalDepartments,
a12:subsidiaryInstitutions,
a13:productInformation,
a14:publicationJournals,
a15:mailingAddress,
a16:tel,
a17:faxNumber,
a18:email,
a19:website,
a20:web
}
detailList.append(detailmsg)
self.writerToExcel(detailList)
def conn144(self):
conn = pymysql.Connect(host='114.115.159.144', port=3306, user='caiji', passwd='zzsn9988', db='caiji',
charset='utf8')
cursor = conn.cursor()
return conn,cursor
def dataToSql(self,detailmsg):
conn,cursor=self.conn144()
try:
# 检查记录是否存在
# 获取当前时间
current_time = datetime.datetime.now()
# 将时间转换为字符串
currentdate = current_time.strftime("%Y-%m-%d %H:%M:%S")
except Exception as e:
print('+++++')
finally:
cursor.close()
conn.close()
# 将数据追加到excel
def writerToExcel(self,detailList):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据追加到现有数据末尾(pandas 2.x 已移除 DataFrame.append,使用 pd.concat)
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
print('保存成功!!')
if __name__ == '__main__':
wanfang=WanfangSpider()
for num in range(0,1321,100):
filename=f'企业_{num}.xlsx'
# # 创建一个工作簿
workbook = Workbook()
workbook.save(filename)
start=num
end=num+100
lsitmsg=wanfang.pageList(start,end)
for msg in lsitmsg:
wanfang.detailMsg(msg)
\ No newline at end of file
Collection notes:
The institution and enterprise data is collected from the Wanfang site, covering the basic information of metallurgy-related institutions and enterprises.
Wanfang address: https://kms.wanfangdata.com.cn/
<img alt="img.png" height="200" src="img.png" width="200"/>
Entry URLs:
Institutions: https://kms.wanfangdata.com.cn/IndustryYJ/Search/Cecdb?q=%E5%86%B6%E9%87%91%2B%E5%86%B6%E7%82%BC%20%E6%9C%BA%E6%9E%84%3Acsi&f=Inst.Type
Enterprises: https://kms.wanfangdata.com.cn/IndustryYJ/Search/Cecdb?q=%E5%86%B6%E9%87%91%2B%E5%86%B6%E7%82%BC%20%E6%9C%BA%E6%9E%84%3Acecdb&f=Inst.Type
<img alt="img_1.png" height="200" src="img_1.png" width="200"/>
Only the basic profile information is collected.
Python package index mirrors:
-i https://mirrors.aliyun.com/pypi/simple
-i https://pypi.douban.com/simple
-i https://pypi.tuna.tsinghua.edu.cn/simple
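For example, to install a package through the Tsinghua mirror (requests is only a placeholder package name here):
pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple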
Uploading attachments via FastDFS
1. The Python library used is installed with: pip install py3Fdfs
Reference: https://www.cnblogs.com/jrri/p/11570089.html
2. The library for uploading files to Huawei Cloud OBS:
pip install esdk-obs-python
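A minimal upload sketch with py3Fdfs, mirroring how the annual-report scripts above call it; ./client.conf and demo.pdf are placeholder paths:

from fdfs_client.client import get_tracker_conf, Fdfs_client

tracker_conf = get_tracker_conf('./client.conf')     # tracker config file (placeholder path)
client = Fdfs_client(tracker_conf)
with open('demo.pdf', 'rb') as f:                    # placeholder file, for illustration only
    result = client.upload_by_buffer(f.read(), file_ext_name='pdf')
# the result dict holds bytes values, used the same way as in the scripts above
full_path = bytes.decode(result['Remote file_id'])   # e.g. group1/M00/.../xxxx.pdf
file_size = result['Uploaded size']
print(full_path, file_size)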
# -*- coding: utf-8 -*-
import pymysql
import pandas as pd
from tqdm import tqdm
import xlsxwriter
import openpyxl
from urllib.parse import urlparse
def pipeiName(qiyedatas):
sql1 = """select id, info_source_code, web_site_name, site_name , site_uri from info_source WHERE web_site_name like '%[name]%' """
cont=1;
qynot=[]
qyin=[]
for qy in qiyedatas:
name=qy['name']
if name is None:
uqynot,uqyin=pipeiURL(qy)
if uqynot:
qynot.append(uqynot[0])
if uqyin:
qyin.append(uqyin[0])
continue
if ''==name:
uqynot,uqyin=pipeiURL(qy)
if uqynot:
qynot.append(uqynot[0])
if uqyin:
qyin.append(uqyin[0])
continue
try:
sql2=sql1.replace("[name]",name)
cursor.execute(sql2)
except Exception as e:
uqynot,uqyin=pipeiURL(qy)
if uqynot:
qynot.append(uqynot[0])
if uqyin:
qyin.append(uqyin[0])
continue
result_data = cursor.fetchall()
if(len(result_data)<1):
uqynot,uqyin=pipeiURL(qy)
if uqynot:
qynot.append(uqynot[0])
if uqyin:
qyin.append(uqyin[0])
else:
cont+=1
print(cont)
qyin.append(qy)
# for row2 in tqdm(result_data):
# try:
# rd = {'id': row2[0],
# '编码': row2[1],
# '网站名称': row2[2],
# '栏目名称': row2[3],
# '栏目地址': row2[4],
# '企业名称': qy['name']
# }
# qyin.append(rd)
# except Exception as e:
# print(e)
# print("查询失败!!"+sql2)
print(qyin)
df_in = pd.DataFrame(data=qyin)
df_in.to_excel('n企业情况在平台中有数据2.xlsx', engine='xlsxwriter', index=False)
print(qynot)
df_out = pd.DataFrame(data=qynot)
df_out.to_excel('n企业情况在平台中没有数据2.xlsx', engine='xlsxwriter', index=False)
def pipeiURL(qy):
uqynot=[]
uqyin=[]
url=qy['url']
sql1 = """select id, info_source_code, web_site_name, site_name , site_uri from info_source WHERE site_uri like '%[url]%' """
if url is None:
uqynot.append(qy)
return uqynot,uqyin
try:
parsed_url = urlparse(url)
domain = parsed_url.netloc
if ''==domain:
uqynot.append(qy)
return uqynot,uqyin
except Exception as e:
uqynot.append(qy)
return uqynot,uqyin
sql2=sql1.replace("[url]",domain)
cursor.execute(sql2)
result_data = cursor.fetchall()
# if(len(result_data)>1):
if(len(result_data)<1):
uqynot.append(qy)
else:
uqyin.append(qy)
# for row2 in tqdm(result_data):
# try:
# rd = {'id': row2[0],
# '编码': row2[1],
# '网站名称': row2[2],
# '栏目名称': row2[3],
# '栏目地址': row2[4],
# '企业名称': qy['name']
# }
# uqyin.append(rd)
# except Exception as e:
# print(e)
# print("查询失败!!"+sql2)
return uqynot,uqyin
if __name__ == '__main__':
# 打开Excel文件
workbook = openpyxl.load_workbook('name.xlsx')
# 获取工作表对象
worksheet = workbook.active
qiyedatas=[]
# 遍历工作表的行
for row in worksheet.iter_rows(values_only=True):
qiyemsg={
'yname':row[0],
'name':row[1],
'url':row[2]
}
qiyedatas.append(qiyemsg)
# 打印每行的数据
# print(row)
conn = pymysql.Connect(host='114.116.44.11', port=3306, user='root', passwd='f7s0&7qqtK', db='clb_project',
charset='utf8')
cursor = conn.cursor()
pipeiName(qiyedatas)
# -*- coding: utf-8 -*-
import time
import urllib
import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import wget
from openpyxl import Workbook
import pandas as pd
def createDriver():
chrome_driver =r'C:\Users\WIN10\DataspellProjects\crawlerProjectDemo\tmpcrawler\cmd100\chromedriver.exe'
path = Service(chrome_driver)
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location =r'D:\crawler\baidu_crawler\tool\Google\Chrome\Application\chrome.exe'
# 设置代理
# proxy = "127.0.0.1:8080" # 代理地址和端口
# chrome_options.add_argument('--proxy-server=http://' + proxy)
driver = webdriver.Chrome(service=path,chrome_options=chrome_options)
return driver
def listPage():
driver=createDriver()
for i in range(0,6):
size=i*20
url = f'https://www.bis.doc.gov/index.php/smart-search?searchword=Russia&searchphrase=all&start={size}'
driver.get(url)
html=driver.page_source
soup=paserUrl(html,url)
text=str(soup.prettify())
doc=pq(text)
titles=doc('dl[class="search-results"]>dt')
dates=doc('dl[class="search-results"]>dd[class="result-created"]')
for i in range(0,len(titles)):
detailList=[]
tt=titles[i]
dd=dates[i]
dddoc=pq(dd)
ttdoc=pq(tt)
title=ttdoc('a').text()
date=dddoc('dd[class="result-created"]').text()
url=ttdoc('a').attr('href')
pdfurl,content=detail(driver,url)
if pdfurl:
pdfpath="D:/cis/"+title+".pdf"
download_file(pdfurl,pdfpath)
else:
pdfpath=''
detailmsg={
"title":title,
"date":date,
"url":url,
"content":content,
"pdfurl":pdfurl,
"pdfpath":pdfpath,
}
detailList.append(detailmsg)
writerToExcel(detailList)
# print(f'title:{title} date:{date} url:{url}')
def detail(driver,url):
k=0
html=''
while k<5:
k+=1
try:
# proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
# response = requests.get(url, proxies=proxy, verify=False,timeout=10)
# html=response.text
driver.get(url)
time.sleep(3)
html=driver.page_source
soup=paserUrl(html,url)
html=str(soup.prettify())
except Exception as e:
html=''
if html:
break
text=paserUrl(html,url)
docc=pq(text.encode('utf-8'))
try:
pdfurl= docc('div[class="docman_download"]>a').attr('href')
except Exception as e:
pdfurl=''
try:
content=docc('div[class="item-page"]').text()
except Exception as e:
content=''
# print(url)
return pdfurl,content
# 将数据追加到excel
def writerToExcel(detailList):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
# 将新数据追加到现有数据末尾(pandas 2.x 已移除 DataFrame.append,使用 pd.concat)
combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
def download_file(url, save_path):
k=1
while True:
if k>5:
print(url)
break
k+=1
try:
header = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'b099bcecf0be876536bb9d4826b25ba8=e2horegllcbddejsiveijp7of0; cookiesession1=678A3E12247313FBD6F74569925F4EFD; _ga=GA1.1.840765784.1693040959; __cf_bm=4DTZeDEU67Xjr5nt9OsbE1g1UTdVuOGdQlhj4KD5U2I-1693190695-0-AW81rfvAFUnclDkFVJYqD8+RWrC8FngMzW0dJ+bVHA+JwmPUVpc9/ogA0jhXrKLFYWun2BoK0R/hqWgGZAw/I1Y=; referrer_site=https%3A%2F%2Fwww.bis.doc.gov%2Findex.php%2Fsmart-search%3Fsearchword%3DRussia%26searchphrase%3Dall; csrf_token=a0d03e256a36d037708a809220564f407dee78bc; _ga_TPRT7QB30Y=GS1.1.1693190696.4.1.1693190720.0.0.0',
'Host':'www.bis.doc.gov',
'Pragma':'no-cache',
'Referer':'https://www.bis.doc.gov/index.php/documents/product-guidance/3300-russia-medical-related-license-application-guidance-fpd-final-incorp-occ-and-3f-cmts-clean-071323',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
proxy = {'https': 'http://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}
response = requests.get(url, proxies=proxy, headers=header, verify=False,timeout=10)
# response = requests.get(url,verify=False)
with open(save_path, 'wb') as file:
file.write(response.content)
break
except Exception as e:
time.sleep(5)
print(e)
return save_path
def download_file3(url, save_path):
header = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'no-cache',
'Connection':'keep-alive',
'Cookie':'b099bcecf0be876536bb9d4826b25ba8=e2horegllcbddejsiveijp7of0; cookiesession1=678A3E12247313FBD6F74569925F4EFD; _ga=GA1.1.840765784.1693040959; __cf_bm=4DTZeDEU67Xjr5nt9OsbE1g1UTdVuOGdQlhj4KD5U2I-1693190695-0-AW81rfvAFUnclDkFVJYqD8+RWrC8FngMzW0dJ+bVHA+JwmPUVpc9/ogA0jhXrKLFYWun2BoK0R/hqWgGZAw/I1Y=; referrer_site=https%3A%2F%2Fwww.bis.doc.gov%2Findex.php%2Fsmart-search%3Fsearchword%3DRussia%26searchphrase%3Dall; csrf_token=a0d03e256a36d037708a809220564f407dee78bc; _ga_TPRT7QB30Y=GS1.1.1693190696.4.1.1693190720.0.0.0',
'Host':'www.bis.doc.gov',
'Pragma':'no-cache',
'Referer':'https://www.bis.doc.gov/index.php/documents/product-guidance/3300-russia-medical-related-license-application-guidance-fpd-final-incorp-occ-and-3f-cmts-clean-071323',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'same-origin',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'sec-ch-ua':'"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
'sec-ch-ua-mobile':'?0',
'sec-ch-ua-platform':'"Windows"'
}
req = urllib.request.Request(url, headers=header)
wget.download(url,save_path)
# 将html中的相对地址转换成绝对地址
def paserUrl(html,listurl):
soup = BeautifulSoup(html, 'html.parser')
# 获取所有的<a>标签和<img>标签
links = soup.find_all(['a', 'img'])
# 遍历标签,将相对地址转换为绝对地址
for link in links:
if 'href' in link.attrs:
link['href'] = urljoin(listurl, link['href'])
elif 'src' in link.attrs:
link['src'] = urljoin(listurl, link['src'])
return soup
if __name__ == '__main__':
# # 创建一个工作簿
filename='cis.xlsx'
workbook = Workbook()
workbook.save(filename)
listPage()
# driver=createDriver()
# url='https://www.bis.doc.gov/index.php/policy-guidance/deemed-exports/deemed-exports-faqs/faq/116-what-areas-are-considered-russia-for-purposes-of-these-sanctions'
# detail(driver,url)