Commit 6e37a78a  Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	shenji/sclx.py
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis
def putCom():
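# Push '<social_code>|<name>' onto the Redis list UpdateBasdeInfo:SocialCode_CompanyName for every company
# in the Excel sheet that is not in com_list; codes containing 'ZZSN' or 'ZD' are skipped.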
com_list = ['91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
'91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
'91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
'91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
'91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
'91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
'91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
'9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
'91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
'91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
'911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
'9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
'911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
'9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
'91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
'911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
'9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
'91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
'91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
'9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
'91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
'912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
'9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
'91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N']
df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
# 连接到Redis数据库
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
for i in range(len(df)):
social_code = df['social_code'][i]
com_name = df['name'][i]
# print(social_code)
if social_code in com_list:
pass
else:
if 'ZZSN' in social_code or 'ZD' in social_code:
continue
else:
item = social_code + '|' + com_name
r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)
def putCom_task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每个月执行一次
scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
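# the cron trigger with day=1, hour=0, minute=0 fires at 00:00 on the first day of every month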
try:
# redisPushData # 定时开始前执行一次
# putCom()
scheduler.start()
except Exception as e:
print('定时采集异常', e)
pass
if __name__ == '__main__':
putCom_task()
\ No newline at end of file
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
with cnx.cursor() as cursor:
select = """select relationName, relationId from klb_company"""
cursor.execute(select)
results = cursor.fetchall()
for result in results:
name = result[0]
xydm = result[1]
item = f'{name}|{xydm}'
r.rpush('SousuoBaidu:companyname', item)
# 列表名称
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# 获取列表中的所有元素
elements = r.lrange(list_name, 0, -1)
# iterate over a snapshot of the list elements
for element in elements:
# LREM with count=0 removes every occurrence of the value and returns how many were removed
count = r.lrem(list_name, 0, element)
# if the value was present, push a single copy back so exactly one instance remains
if count >= 1:
r.rpush(list_name, element)
# 打印处理后的列表
print(r.lrange(list_name, 0, -1))
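# A minimal alternative sketch (assumption: the same Redis connection `r` and the same list_name) that
# deduplicates in one pass by rebuilding the list from an order-preserving dict instead of repeated LREM calls:
#   unique = list(dict.fromkeys(r.lrange(list_name, 0, -1)))
#   pipe = r.pipeline()
#   pipe.delete(list_name)
#   if unique:
#       pipe.rpush(list_name, *unique)
#   pipe.execute()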
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# df_all = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx', dtype=str)
with cnx.cursor() as cursor:
select = """select relationName, relationId from klb_company"""
cursor.execute(select)
results = cursor.fetchall()
# print(results)
for result in results:
name = result[0]
xydm = result[1]
with cnx.cursor() as cursor:
update = '''update sys_base_enterprise set name = %s where name is null and social_credit_code = %s'''
cursor.execute(update, (name, xydm))
cnx.commit()
print(f'{name}==={xydm}更新完成')
# list_xydm = ['91110000102017145R','911100001021096991','9111000010285973X7','91110000108283057Y','911100003180821571','91110000322283429E','91110000336431162N','911100005620857121','911100006000036940','911100006000107204','911100006000348885','911100006004827014','911100006337095702','91110000633713369X','911100006337942853','9111000063379674X4','9111000066990444XF','91110000672354637K','91110000700004889C','91110000700006921H','91110000700049024C','91110000700084217T','911100007001499141','9111000070038501XJ','91110000710923360K','91110000710924945A','911100007109255774','911100007177242684','91110000723951109B','91110000726360320G','91110000726362190T','911100007263731643','911100007423131451','91110000754166859U','91110000767525590U','91110000771589298U','911100007776681570','91110000783967006U','91110000802062000J','91110101101100895C','91110101335453570K','91110101355304193A','91110101579007657G','91110101783962889A','911101020592352188','911101021011011341','91110102634381829U','91110102674290067J','91110102685772854R','9111010278170742XX','91110102MA01FFJ36J','91110105051390889B','91110105101756720B','91110105306495333L','91110105306737662D','911101053179472352','91110105318247193G','91110105335500066Q','9111010535131161X2','91110105357967759L','91110105397625067T','91110105400614650L','91110105562128137P','91110105575219505U','911101055790551576','91110105585848161G','911101055938354164','911101055977289680','91110105600015572M','91110105625911031F','911101056336607540','91110105664618436J','91110105669928206J','91110105672840619D','91110105679620184F','911101056876404680','91110105690843864U','91110105726334827M','91110105756025873C','91110105756700197H','9111010576143898XE','91110105764202737L','91110105777670900X','91110105783991313X','911101057877635020','91110105790696320H','91110105801719541B','9111010580171955X3','9111010580177089XM','91110105802095822J','91110105MA002Q6M79','91110105MA003RD50R','91110105MA004C0H06','91110105MA00AGXN3L','91110105MA00FJHN72','91110105MA01AEWR5C','91110105MA01L9PH51','911101060695678147','911101060741434189','91110106101133080K','91110106306572212M','91110106351301243L','91110106567475437Y','911101065768942978','91110106585840012D','91110106587714554K','91110106593832696G','91110106633760720H','91110106633764772R','91110106663111019U','9111010667059416X2','91110106675098771D','911101066932508023','91110106749395454K','911101067533312850','9111010676504112XW','911101067855339571','911101068022066683','91110106MA0056B19T','91110106MA005DBW1G','91110106MA01P1RE3Y','91110107102288949G','91110107587683145R','91110107MA009GQ72T','911101080513793057','91110108061322142F','911101080627636876','91110108062782191G','911101080628016980','911101080649193741','91110108067265302X','91110108074122078Y','911101080765656577','91110108078505359A','91110108078545633M','911101080785732550','91110108089647010H','911101080918560737','91110108093369842B','91110108096441731D','911101080984827059','91110108099067984A','91110108099442801R','9111010810110401X3','911101081011420915','91110108101609659C','91110108101880422A','911101081020223907','9111010810202736X2','91110108102094378J','911101083065093288','91110108306623614J','91110108318056936P','91110108318058456U','9111010831813798XE','91110108327142377N','911101083271749266','911101083272391527','91110108335481926M','91110108335562435H','911101083363962058','911101083398292057','91110108344290793F','91110108344314759F','911101083443180558','911101083443783306','91110108344403743F','91110108355313321X','91110108400
001643B','91110108551427625G','91110108554837179A','91110108554890762H','91110108560358422K','91110108560385447N','91110108560432856H','91110108562135265P','91110108563622495U','91110108565780884D','9111010856749593XU','911101085694855326','911101085694925139','91110108569524423F','911101085712035817','9111010857128414X8','911101085712845102','91110108576914390R','91110108576914817K','911101085790313156','911101085844819439','91110108585861972A','9111010858587583XQ','91110108587665983J','911101085890746187','91110108590662476F','911101085923662400','9111010859963405XW','91110108599644434U','91110108599663854W','911101086000694820','911101086003726929','91110108600404359L','91110108633708906M','91110108660513776K','91110108661550528Q','911101086615579497','91110108662151975E','91110108662170324C','911101086621777295','911101086631036849','911101086631154075','91110108663124944W','91110108663136638D','91110108664619674E','911101086656289355','91110108666258040N','911101086684483666','91110108671727577D','91110108672826657J','911101086738170589','9111010867662354XX','911101086766404898','91110108679604408D','91110108679611421U','911101086796241695','911101086804563776','91110108682894987G','911101086835621402','91110108686919328W','911101086883662373','91110108690011590J','91110108693213091F','911101086950387332','9111010869504894XN','91110108696323261L','91110108699627252X','91110108700235062K','91110108718777804Y','91110108718785556J','91110108722617934K','9111010872357215XK','91110108723952478G','911101087263410239','91110108733464566A','911101087355893625','9111010873559070X0','91110108735591489G','91110108737656338N','911101087376669155','91110108740421820F','91110108746113570P','91110108746729965F','91110108752161931Y','91110108753327825C','9111010875333972X7','9111010875467591XH','91110108754681201W','91110108758242935T','911101087582455976','911101087601419302','9111010876142254XU','91110108762181186P','911101087629781362','911101087635308194','91110108766287121Q','911101087684682847','91110108768471723F','91110108769354705D','91110108769356188B','91110108769900489W','911101087704233332','91110108770425654N','91110108771981556U','91110108771986242H','91110108773361465H','91110108773369432Y','911101087740615606','91110108774714285P','91110108775491714G','91110108776352708P','91110108777650264L','911101087776681301','911101087795289672','9111010878020592XF','91110108780217285R','91110108780238166U','91110108780955384Y','91110108780964686N','91110108781703664R','91110108782543551R','91110108783218849X','911101087839528242','911101087861701904','91110108790650445R','911101087921006070','911101087934019542','91110108794082078E','911101087951375794','91110108797552733T','91110108798525948B','911101087990254941','911101088011707638','91110108802021110U','911101088020333577','91110108802041787A','91110108802045657E','91110108802068007C','911101088020726207','91110108802109673L','91110108MA001N718J','91110108MA0021P69M','91110108MA002XL790','91110108MA003LNY5D','91110108MA003TAB64','91110108MA003YWP4D','91110108MA0043KP9E','91110108MA004F704R','91110108MA004LW69T','91110108MA004RAE05','91110108MA0068GY1F','91110108MA006K8Y3P','91110108MA0071CR55','91110108MA007H3P5K','91110108MA0086HR6G','91110108MA008DA429','91110108MA008HB66A','91110108MA008P9657','91110108MA008PK575','91110108MA0092QT4X','91110108MA00AGM13W','91110108MA00AU927M','91110108MA00DCJ01Y','91110108MA00DE1B2B','91110108MA00FA7E5C','91110108MA00GUD41A','91110108MA0188DW84','91110108MA018J4L08','91110108MA018MCC6M','91110108MA01BBB16
K','91110108MA01BP1P7B','91110108MA01C8JR79','91110108MA01DMU77F','91110108MA01DNC75B','91110108MA01EGPQX2','91110108MA01RCWH0M','91110108MA01RWUG4Y','91110108MA01WQE10K','91110109330285061E','91110109567452606A','91110109590674493W','9111011159606037XJ','911101116812383633','91110111700001063P','91110111MA003JG31Y','91110111MA01L2H65N','91110112551358631R','91110112700216160K','911101127177330338','911101127400501696','911101127415832828','9111011276218407XN','91110112766758720D','911101127889851669','9111011279904576XL','91110113080516727E','91110113306541555R','91110113576855941L','911101136812208172','91110113696302276M','91110113741581703F','91110113752642938G','91110113756000350K','911101137577358263','91110113762992739Q','91110113MA001GWR0M','9111011408549335X2','9111011455135477XA','91110114582515556F','91110114589114325P','91110114590663348R','91110114600067778R','911101146796092682','911101146804798353','91110114682851688K','91110114685107782U','911101146900106275','911101147226688971','911101147426127944','91110114744716255J','9111011475010452XE','91110114750144214X','911101147667528632','911101147770556682','91110114MA001D4X3K','91110114MA01AEDF61','911101151016193470','911101151029162045','91110115576904205N','91110115733451490U','91110115736468984G','91110115746112690C','91110115MA0048EL1E','91110115MA017K5L4X','91110116064905925Y','911101165996396434','91110116767502874D','91110116MA005B3L58','91110116MA01C0AY5K','91110117330386452K','911102283272479535','911102283512805187','91110228582505681F','9111022867876096X3','911102287177842959','91110228754175237Y','91110228MA006GMF6R','9111030205136463XD','91110302053604529E','91110302057391444C','9111030210221806X9','911103021022784175','91110302306784047Q','911103025604366893','91110302565797010A','91110302565820110R','911103026003405002','91110302677444199R','911103026787533566','911103026857985287','91110302735090430Y','911103027493533932','911103027493534308','9111030276350109XG','9111030278250283XW','91110302801786752A','91110302MA0048YP1U','91110302MA005FFW29','91110302MA0066E64R','91110302MA008RUM5Y','91110302MA00AR3F76','91110302MA00B9G54G','91110302MA00B9MQ4G','91110302MA00BJ6B78','91110302MA00G8EH41','91110302MA00GQGB73','91110302MA00GRMLX4','91110302MA01AAXW1T','91110302MA01HEH15A','91110400MA029M4P80','91120000058736889L','91120000103069967Y','91120000103870914U','91120000741366579H','91120000761253280R','911200007676306733','91120000MA06F32U06','911201046630720486','91120104789385824Y','911201048034181441','91120110083028075A','91120110300659413H','91120111103789059M','911201116847488286','91120111697419046H','91120111722991870E','91120111741361313C','911201117925370324','91120112064042488E','91120112093771153W','911201127803488406','91120112MA05WM7M02','91120113079635948K','91120113660321205C','911201137303863474','91120113783335092P','911201160587336021','911201160612051730','91120116086586515N','91120116103481433E','91120116239661863L','91120116239663439U','91120116300452033U','911201163409833307','9112011655651308XJ','91120116562678278A','91120116586419887T','91120116592916759Q','91120116600910892X','911201166630834172','91120116671457175N','91120116675967105W','91120116697408240K','91120116712934952M','91120116718278597H','911201167244641345','91120116730357968N','911201167328190464','91120116735474530F','91120116746652267N','911201167491124502','91120116758137027D','91120116764348197P','911201167803339648','911201167833047124','91120116794980409G','911201167972829995','91120116MA05PQB5XT','91120116MA069EXE4T','91
120116MA06DRM4XY','91120116MA0705BL96','911201186670532667','91120118735488182M','91120118762158867F','91120118MA05JQUK0G','91120118MA05QFTE3C','91120118MA05T81X8A','91120118MA0697LP9T','91120118MA06T62187','911202216877459052','91120222566105610W','91120222575108434H','911202227706300842','91120222MA05KHKY2P','91120222MA05UAG55H','91120223600894351U','91120223761280668D','91120224300621490X','911202245661215811','91120224681877747F','91120224700557176T','91120224MA07871882','9113000023565800XC','91130000752446136W','911301001044060055','91130100107744755W','91130100601090291K','91130100689298985P','91130100732910720N','91130100732914772Y','91130100745411306F','911301007468556979','91130100754027891A','91130100776179546U','91130100787019708G','91130101678512755X','91130101789833818T','91130104784084838J','911301257898318475','91130132MA0A7AYE2H','91130181791385313K','91130181791386236G','91130182685711699G','91130182791357005D','91130183575506723L','911301836870224839','911301837713256634','911301847216647980','911301856652827511','91130185669060689W','911301857233544863','9113018576519998X9','91130193074894510E','911301931078905417','911302005661986189','911302006799397935','9113020068276818X4','91130204347873513P','91130224666556267M','91130225MA07U3734B','91130229721600380L','91130281MA07KE3A17','91130282554499915A','91130282750290545E','91130282MA07P2E981','91130283052683448M','91130283601019508G','91130293096112137N','911302936746855014','91130293774420041F','91130293796568127H','91130294308381129A','91130300601108025E','91130301329656355R','91130301601147147J','91130301678536714D','9113030168136727XL','91130301MA08XEAB6Q','911303035673840924','91130303673240113T','91130392601151496U','91130400730275049G','91130400MA08CCBX29','91130405748493781M','91130407MA0CUE7R5R','91130408MA08XFQJ61','911304246843413669','91130424757510432X','91130426699207653P','91130429679913817F','91130434MA07R66J0A','91130435564863776A','911304816690569897','91130481721643479D','91130500769806003D','91130501693478268W','91130525721609633G','911305287343786273','91130528737368715C','91130528743430458K','91130532723397101T','91130582095633598M','911306007006711044','911306055728149239','91130605596834603J','91130605601201668M','911306056746516436','91130606550419199D','91130637752422695J','9113070556195375XQ','91130705769821035L','911307317870236272','91130800757548430L','91130802728832010D','91130803MA0CMRHN8L','91130824771327626D','9113090010971869X1','91130900670338967F','91130900700660368J','91130900765171063F','911309007681306540','91130900779198582P','91130900791398851A','911309033200553935','91130922687004365L','91130923732923871M','9113092510971914XX','91130927789824567A','911309313081379192','91130931329627183Q','91130931557675726N','91130931MA07MEM874','91130981596828756D','91130981763428435T','911309825809745213','911309826760246784','911309827401835863','91130983MA07N7T53G','91130984567358986H','911310001057748114','9113100060134890XT','911310007634343680','91131022336194910M','9113102258690708X9','91131025731429118G','9113102630827362XB','91131028MA07KQYW1M','91131081335912618X','9113108267322544XE','91131082755457551W','91131100109804512G','91131100700865494B','91131100746851872J','91131101236298229N','911311017913820594','91131101MA08EWC63Q','911311220826746736','91131125780842443M','91131127560486483D','911311816882161913','91131181700712973X','91131182093289869R','91131182109874836Y','91131182779189192D','91140000056278968H','91140000160963703Y','91140000330566883Q','911400005973987278','91140000
701000732H','91140000715931861P','9114000074855218XX','91140100110047117B','91140100556560310M','91140100568462347Q','91140100578457859T','91140100586171535D','91140100713634804H','91140100713674988T','91140100715946502L','91140100754093899G','91140100792241864R','91140108731935643H','91140121757294792H','91140200770127753X','91140300694291892Q','91140300748578443L','91140311666645518N','91140322110721968E','911404001107700495','911405003257661198','91140500586185996N','91140522MA0JRG8Q99','911406006024604424','91140700065564755Y','91140700719819164X','91140800556559520F','91140800733994655W','911408007540500477','911408227011988570','9114082370110438XN','91140900798276152L','91140930794219089L','91140932729686916F','911500001141618816','91150000733284733B','91150100573268485R','911502047971536367','91150291594612345H','91150291701423911F','91150291787086089U','91150800701444800H','911525007116525588','91210000118887313L','91210000242666665H','91210000686609602P','91210000738792171J','912101000016232858','91210100117812926M','91210100243490227Y','912101006046149869','912101006625215774','91210100769563590L','91210100798474220Q','912101047845707057','91210105057192314D','91210105564689755B','91210105798464057N','912101063132548617','912101066671654449','912101066874643611','912101067386643481','91210106755504303X','91210106760090619H','91210106769599542Y','91210112675348347K','9121011271579529XH','91210112738671871J','91210112769598654A','91210112MA0P432U8R','91210112MA0P44NRXH','91210113088956102C','91210113578366586N','912101137555387734','91210113760060444N','91210114340680807E','91210181MA0XQF19XH','9121020011831278X6','91210200118561313C','91210200241297917U','912102006048648626','91210200677529168F','91210200716992578X','91210200723495318L','912102007409045158','91210200744362020N','91210200751579797A','91210200MA0TR2P80G','91210204MA0QDTY23G','91210211MA0Y19KN3J','91210212732749973K','91210213604838795D','912102137327794199','912102137920497177','91210231736407196M','91210231756073509F','91210242118382526E','91210242728848952B','91210244559824828B','91210283696011524C','91210300MA0TT2DH9R','91210381241525115T','91210381567557686J','91210381603655081B','9121060008113718XY','91210600120109772C','91210600242814525N','912106005909453539','91210682781643139U','912107000721599341','912107002420322837','91210700577233300Y','91210700736737822M','91210700749779175E','91210700768337030B','912108006768912029','912108006926672350','91210900584194995N','912109007016848390','91210921MA0XX7NM2T','9121100059093999XJ','91211000726845918Y','9121100474278967XK','91211021590945396H','91211103MA102UN249','91211200561382299U','912112006737775195','9121122106407122X3','91211300791572581J','9121140055815624XQ','91211400747119974B','91220000664275090B','9122000070222720XH','91220000786819498L','91220101050518975F','91220101081849654U','912201012449758167','91220101310012867G','91220101697761845P','91220101727117306C','91220101730777372U','912201017484274776','912201017561541220','912201017671658636','912201017671930129','912201017710567829','91220101794404583W','91220101794442483P','91220101MA14TY564H','91220104675648489W','912202011239483018','912202016051690282','91220201682611844F','91220201786812798U','91220300565092475E','91220422MA0Y3F777J','91220501126870028U','91220501244575134M','91220501244583871H','91220501723101462L','91220501791105350N','91220521MA173E261W','91220523660141001R','9122082155977797XR','912224037742347248','912301001275921118','91230100607168790X','91230108127420096N','91230108MA1B0J
HY73','912301107631541551','912301993011658539','91230199301195470J','91230199301211856H','9123019958512805X3','91230199696825683P','91230199775036754Q','91230199MA18XA396K','91230300130721906W','91230600560617893B','91230600MA1BF4U29A','912306056926467095','912312815838229881','91233001569893325G','91310000051240362X','9131000005124956XX','9131000005304658XH','91310000057656705A','913100000625940784','91310000076492259A','91310000080013687R','913100001321244277','913100001321644452','913100001322131129','91310000132231361P','91310000132653687M','91310000132660318J','9131000013297865X2','9131000013413459XC','91310000300253536H','91310000301354857P','91310000312519282U','91310000324299264L','913100003244893596','91310000342056098N','913100003508461023','91310000351008055W','913100005515491712','91310000552962929G','91310000557430243L','913100005665114915','91310000568072146K','913100005708082124','91310000572698184Q','91310000582138631D','91310000582139781F','913100005867988561','91310000590384058P','91310000590397350D','913100006072612077','91310000607272280Q','91310000607286404W','913100006072944121','9131000060729499X9','91310000607311067X','91310000607339123C','913100006073622866','91310000607370331G','913100006073785958','91310000607403041J','91310000607404087G','91310000607422576R','913100006074261470','91310000607431720X','9131000060751688XT','913100006075916282','91310000607601064L','9131000063021103X7','91310000630453442X','91310000630483465G','91310000630948912G','91310000630965915K','91310000631137409B','91310000631191552K','913100006314149553','913100006314627462','91310000631521822M','91310000631534594F','913100006317557680','9131000066240918XU','91310000667780236Q','91310000669359189D','91310000669363292T','91310000669421384T','913100006711091037','91310000674575425N','913100006746031318','91310000680976508E','91310000680999558Q','91310000682254509X','91310000682263886E','913100006873885738','91310000690125272H','91310000692998798F','9131000069420172XB','9131000069578172XC','91310000695810746C','91310000697295223K','913100006988365624','913100007030116706','913100007030557379','913100007030973396','91310000703147746G','91310000703340159B','913100007294735903','91310000729493479N','91310000733344636F','91310000733365971U','91310000734057153P','91310000734081815D','91310000735408592G','913100007381411253','91310000738505304H','913100007385256042','913100007421053624','91310000747273971D','91310000748756174J','913100007487913409','91310000751468181F','91310000751863771N','913100007518999777','91310000756110429R','9131000075842961XY','9131000075855850XT','91310000759040681R','91310000761199691M','913100007653010244','91310000765583375Y','91310000765596096G','91310000768354199F','913100007694197083','91310000770201458T','913100007714584745','91310000772115131G','91310000772864810L','913100007728924912','91310000773282177G','913100007743059833','91310000774323671U','9131000077478390X5','91310000775216587B','91310000775238065L','913100007757838991','91310000778930516R','913100007824379352','91310000784298270U','91310000784783241W','91310000787230976G','91310000787878254Q','91310000792703993P','91310000792783700P','91310000797050338W','91310000MA1FL74J78','91310000MA1G8BHPXX','91310000MA1H38T58K','91310000MA1H3GDC5H','91310000MA1J37FN5Q','91310000MA1K2Q6J2X','91310000MA1K35P57Y','91310000MA7CJ9P40C','913101040693974723','91310104301579458U','91310104342172646H','913101043423482187','9131010455298989X1','913101045758452582','9131010458529260X8','91310104669392966T','91310104674626798A',
'913101046855187256','91310104692921256Y','91310104742657562G','91310104776270040D','91310104MA1FR0P33B','91310104MA1FR9PL54','91310105074824416U','91310105090037252C','91310105312284129D','91310105779753697E','91310106066020397Q','91310106550090004W','91310106630236093C','91310106MA1FY9LT3N','91310107051295590B','913101076076323035','91310107781531233F','91310109312143131N','91310110054590464F','91310110078155571L','913101100861724784','91310110342313605X','913101103507613521','91310110351027504X','91310110351154941K','91310110591673062R','91310110757926286X','91310110787862412B','9131011205506145X2','913101121326732580','91310112301708379M','91310112350881637E','91310112350889276J','91310112351114237B','91310112557480662J','913101125868251134','91310112607425988Y','91310112607671054B','91310112630792962D','91310112767225977D','913101127785041388','913101127851867808','91310112789576698P','91310112MA1GB5HL74','91310112MA1GB63D5E','91310112MA1GBCU74Q','91310112MA1GBEPY9P','91310112MA1GBWLUXN','91310112MA1GC28U0L','91310112MA1GC78A07','91310112MA1GCHQP57','91310113086201072B','913101131345344112','91310113342290888U','91310113550058717X','91310113631482720W','91310113754764752Y','913101140693041410','91310114074811922G','91310114320742767K','9131011434217342X2','9131011455159938XA','913101145529068046','913101145619308064','91310114570796872F','91310114577469866W','9131011458207544XY','91310114588740092M','91310114591692730A','91310114630211689Y','913101146305896733','913101146308058904','913101146315357223','913101146319344919','91310114695793034W','913101147030104249','91310114754758651R','913101147557198576','91310114760573215T','913101147728614257','91310114781898318F','91310114MA1GT4926T','91310114MA1GURM19N','91310114MA1GW61HX2','91310114MA1GWJL62M','91310115051251125K','913101150512565326','913101150608727672','913101150609007219','91310115080028627C','9131011508201988XX','91310115084100518T','913101150938266958','913101151321295193','913101151339870722','913101153124932461','9131011532075221XC','91310115324253960J','91310115324284513E','9131011533262045X2','91310115332642560W','913101153986795507','91310115555949711X','91310115568057640Y','91310115569630816D','91310115570750452T','91310115572703801L','91310115599770596C','913101156822157531','91310115690170444F','913101156972022424','91310115703497359F','91310115767236430H','913101157732980993','91310115779776581R','91310115792736664G','91310115795654795D','91310115MA1H70PK5R','91310115MA1H727E7E','91310115MA1H7RLE45','91310115MA1H7W8439','91310115MA1H7W8514','91310115MA1H9HD02E','91310115MA1H9K3FX7','91310115MA1HATB40R','91310115MA1K39C71R','91310115MA1K3B1R09','91310115MA1K3BQK2U','91310115MA1K3CM30B','91310115MA1K3F6C05','91310115MA1K3K2N9H','91310115MA1K3KJW0N','91310115MA1K3MP458','91310115MA1K41R2X3','91310115MA1K493TXQ','91310115MA1K4CLB55','91310115MA1K4MF39X','913101160637712405','913101161321521531','91310116555985835J','913101165601545691','91310116563135240C','91310116566563515F','913101166607195719','91310116662495241T','91310116671156516L','91310116676273009U','91310116687330646Y','91310116759882926H','913101175559503333','9131011756018678XG','91310117574182309H','91310117598194355D','913101176311798956','91310117632167028T','91310117662458598U','91310117662473499L','91310117662485385P','91310117669377619D','913200001347587142','91320000583783720B','913200006082630012','91320000710929340E','91320000743141824Y','91320000751254554N','91320100093975981A','91320100134974572K','91320100135847161T','913201025628951334','91320
191134955910F','91320191726079387X','913201921349556628','913201922497944756','91320200135890776N','91320200135914870B','913204002508323014','91320500741304044W','91320509138285715E','91320509796141166A','91320582134789270G','913207007322513070','91320982571427139M','91321291703974741U','913300001429120051','9133000014293866XE','91330000142941287T','91330000710924531U','91330100253930310D','91330110MA2CGBC056','913302001440685655','91330200704800698F','91340000148941616G','91370112MABYCTU036']
# for xydm in list_xydm:
# for num_df in range(len(df_all)):
# social_code = str(df_all['social_code'][num_df])
# if social_code == xydm:
# com_name = str(df_all['name'][num_df])
# with cnx.cursor() as cursor:
# update = '''update sys_base_enterprise set name = %s where social_credit_code = %s'''
# cursor.execute(update, (com_name, xydm))
# cnx.commit()
# print(f'{xydm}===更新成功{com_name}')
# break
\ No newline at end of file
......@@ -45,6 +45,9 @@ def get_html(tycid, s, headers):
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
if div_part is None:
return -2
else:
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
......@@ -64,7 +67,10 @@ def get_page(url, s, headers):
if res.status_code != 200:
raise
data_page = res.json()
try:
total_page_ = data_page['data']['total']
except:
raise
return total_page_
......@@ -77,11 +83,12 @@ def doJob():
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
......@@ -90,7 +97,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
social_code = '911101067916069050'
social_code = '91110108780992804C'
if social_code == None:
time.sleep(20)
continue
......@@ -163,6 +170,11 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
......@@ -240,6 +252,8 @@ def doJob():
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
if flag == 1:
for one_info in list_all:
name = one_info['name']
......
"""
天眼查人员信息
问题1:页面和接口数据不一致 目前方法 单独处理
问题2:页面人员总数拿的不够准确 目前方法 修改获取父标签逻辑 已解决
"""
import datetime
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员更新'
from lxml import etree
from classtool import Token, File, Tag
token = Token()
@retry(tries=3, delay=1)
def get_html(tycid, s, headers):
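# Open the Tianyancha company page and locate the staff section (div with data-dim='staff').
# Returns the head count shown after '最新公示' when that tab exists, -1 when the section lookup fails
# or the tab text lacks '最新公示', -2 when the page has no staff section, and 0 when the count cannot be parsed.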
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
response = s.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
if div_part is None:
return -2
else:
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return -1
except:
return 0
@retry(tries=3, delay=1)
def get_page(url, s, headers):
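# Request one page of the staff/executive API through a proxy and return data['data']['total'];
# a non-200 response or a missing field raises so the @retry decorator can try again (3 attempts).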
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
time.sleep(1)
if res.status_code != 200:
raise
data_page = res.json()
try:
total_page_ = data_page['data']['total']
except:
raise
return total_page_
def doJob():
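# Main loop: pull one '<social_code>|<company_name>' item from the Redis queue UpdateCoreperson:SocialCode_CompanyName,
# look up (or insert) the enterprise record, resolve its Tianyancha id, scrape the core-personnel list via the page
# and the matching API, build list_one_info, and push the item back into Redis on failure.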
# for social_code in social_code_list:
while True:
# todo:设置cookies的使用
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C'
if item == None:
time.sleep(20)
continue
start = time.time()
social_code = item.split('|')[0]
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
count = data[17]
else:
# 数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
#数据库中并没有该企业 需要新增
pass
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name, xydm))
cnx_.commit()
tycid = ''
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name, s)
if retData['state']:
tycid = retData['tycData']['id']
# # todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('UpdateCoreperson:Error', item)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('UpdateCoreperson:Error', item)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
try:
charge = get_html(tycid, s, headers)
# 页面请求三次都失败
except:
charge = -1
t = int(time.time() * 1000)
if charge == -1:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
time.sleep(1)
try:
total_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
baseCore.rePutIntoR('UpdateCoreperson:Map', item)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应---{charge}---{total_page2}---{total_page3}')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
# # todo:获取页数
# total_page = 34
# flag = 2
# todo: 测试程序是否执行到这一步
log.info(f'总数为{total_page}')
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = requests.get(url_, headers=headers, proxies=ip, verify=False) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
break
continue
if res:
pass
else:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
# todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{res.json()}')
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
# todo:获取当前年份
now = datetime.datetime.now()
year = now.year
try:
birthYear = year - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
elif flag == 3:
for one_info in list_all:
name = one_info['personal_name']
try:
sex = one_info['gender2']
except:
sex = ''
education = ''
position = one_info['position_name']
Salary = ''
try:
birthYear = one_info['year_of_birth']
except:
birthYear = ''
personInfo = one_info['resume_cn']
try:
timestamp = int(one_info['employ_date']) / 1000
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
except:
currentTerm = ''
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm + '至-',
"personInfo": personInfo,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
else:
for one_info in list_all:
name = one_info['name']
try:
position = one_info['typeSore']
except:
position = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
else:
pass
# response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
# verify=False)
# print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
doJob()
\ No newline at end of file
# -*- coding: utf-8 -*-
import datetime
import json
import re
import time
......@@ -13,7 +14,7 @@ from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
......@@ -56,6 +57,12 @@ def sendkafka(post_data):
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
def Lreputredis(company_field):
# todo: 重新放入redis
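# remove the 'end' sentinel, append the company, then re-append 'end' so the sentinel stays at the tail of the queue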
baseCore.r.lrem('BaseInfoEnterprise:gnqy_socialCode', 0, 'end')
baseCore.r.rpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.rpush('BaseInfoEnterprise:gnqy_socialCode', 'end')
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
# 取出两个字典的key值集合
......@@ -320,17 +327,18 @@ def dic_handle(result_dic):
}
return aa_dict
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
# ip = baseCore.get_proxy()
# req = requests.get(headers=headers, url=url, proxies=ip)
req = s.get(headers=headers, url=url)
time.sleep(1)
soup = BeautifulSoup(req.content, 'html.parser')
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
......@@ -350,7 +358,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
......@@ -359,18 +368,23 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
if searchinfo == '0':
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
except:
log.info("登录失效===重新放入redis")
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
else:
# 开始采集
try:
......@@ -383,16 +397,17 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
......@@ -408,15 +423,84 @@ def ifbeforename(company_url):
else:
return ''
#解析时间
def paserTime(publishtime):
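# Convert Tianyancha's relative timestamps ('3天前', '昨天', '2小时前', ...) into datetime objects anchored to now;
# absolute dates such as '2023年05月01日' are parsed directly. If parsing fails, the input string is returned unchanged.
# Example: paserTime('2天前') gives a datetime two days before the current time.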
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
# delta = datetime.timedelta(months=day)
publishtime = current_datetime - relativedelta(months=day)
# publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
if '小时' in publishtime:
hour = publishtime.split("小时")[0]
else:
hour = 0
if hour != 0:
min = publishtime.split("小时")[1].split("分钟")[0]
else:
min = publishtime.split("分钟")[0]
delta = datetime.timedelta(hours=int(hour), minutes=int(min))
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
# 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
try:
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
log.info(f'天眼查无该企业{social_code}')
return
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
......@@ -502,50 +586,55 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
print(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
else:
data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName
aa_dic['securitiesCode'] = securitiesCode
aa_dic['securitiesShortName'] = securitiesShortName
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# sendkafka(aa_dic)
print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# todo: 重新放入redis 删除end再放入ruend
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
log.error(f'未找到工商信息,重新塞入redis')
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# aa_dic['ynDomestic'] = ynDomestic
# aa_dic['countryName'] = countryName
# aa_dic['securitiesCode'] = securitiesCode
# aa_dic['securitiesShortName'] = securitiesShortName
# aa_dic['listingDate'] = listingDate
# aa_dic['category'] = category
# aa_dic['exchange'] = exchange
# aa_dic['listingType'] = listType
# # sendkafka(aa_dic)
# print(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
......@@ -561,7 +650,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
......@@ -623,11 +713,30 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return False
return True
def login():
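# Attach a cookie set obtained from token.get_cookies() to the already-open driver and refresh the page so
# Tianyancha sees a logged-in session; returns the driver together with the cookie record id used later by token.updateTokeen.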
# time.sleep(10)
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/天眼查'
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
......@@ -644,7 +753,7 @@ if __name__ == '__main__':
# cookies = {}
# for cookie in cookies_list:
# cookies[cookie['name']] = cookie['value']
s = requests.Session()
# s = requests.Session()
# s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
......@@ -674,7 +783,8 @@ if __name__ == '__main__':
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
continue
continue
# company_field_ = f'|{company_field}'
......@@ -701,7 +811,7 @@ if __name__ == '__main__':
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name)
time.sleep(10)
# break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
......
# -*- coding: utf-8 -*-
import json
import re
import time
import datetime
import pymongo
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from kafka import KafkaProducer
import urllib3
from retry import retry
from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log = baseCore.getLogger()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
from selenium import webdriver
from selenium.webdriver.common.by import By
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
# 发送数据
def sendkafka(post_data):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("enterpriseInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
# soup = ''
# return soup
return soup
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
# 取出两个字典的key值集合
keys1 = set(dict1.keys())
keys2 = set(dict2.keys())
# 取出并集
union_keys = keys1 | keys2
# 根据并集的key值,从两个字典中取出value值,组成新的字典
result_dict = {key: dict1.get(key, None) or dict2.get(key, None) for key in union_keys}
return result_dict
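# 用法示例(仅为示意,字典内容为假设数据,非真实采集结果):
# getinfo 对两个字典的键取并集,值优先取 dict1 中非空的,否则取 dict2 中的
# dict1 = {'企业名称': '示例公司', '电话': None}
# dict2 = {'电话': '010-00000000', '邮箱': 'demo@example.com'}
# getinfo(dict1, dict2)
# 期望结果: {'企业名称': '示例公司', '电话': '010-00000000', '邮箱': 'demo@example.com'}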
def dic_handle(result_dic):
zxss = ['北京市', '天津市', '上海市', '重庆市']
try:
company_name = result_dic['企业名称']
except:
company_name = None
try:
CreditCode = result_dic['统一社会信用代码']
except:
CreditCode = None
try:
OperName = result_dic['法定代表人']
except:
OperName = None
try:
PhoneNumber = result_dic['电话']
except:
PhoneNumber = None
try:
WebSite = result_dic['网址']
except:
WebSite = None
try:
Email = result_dic['邮箱']
except:
Email = None
try:
Desc = result_dic['简介']
except:
Desc = None
try:
Status = result_dic['经营状态']
except:
try:
Status = result_dic['公司现状']
except:
Status = None
try:
StartDate = result_dic['成立日期']
except:
StartDate = None
try:
RecCap = result_dic['实缴资本']
except:
RecCap = None
try:
RegistCapi = result_dic['注册资本']
except:
RegistCapi = None
try:
CheckDate = result_dic['核准日期']
except:
CheckDate = None
try:
OrgNo = result_dic['组织机构代码']
except:
OrgNo = None
try:
No = result_dic['工商注册号']
except:
No = None
try:
taxpayerNo = result_dic['纳税人识别号']
except:
taxpayerNo = None
try:
EconKind = result_dic['企业类型']
except:
EconKind = None
try:
TermStart = result_dic['营业期限'].split('至')[0]
except:
TermStart = None
try:
TeamEnd = result_dic['营业期限'].split('至')[1]
except:
TeamEnd = None
try:
TaxpayerType = result_dic['纳税人资质']
except:
TaxpayerType = None
try:
SubIndustry = result_dic['国标行业']
except:
SubIndustry = None
# try:
# region = result_dic['所属地区']
# except:
# region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
try:
BelongOrg = result_dic['登记机关']
except:
BelongOrg = None
try:
Info = result_dic['人员规模']
except:
Info = None
try:
can_bao = result_dic['参保人数']
except:
can_bao = None
try:
OriginalName = result_dic['曾用名']
except:
OriginalName = None
try:
EnglishName = result_dic['英文名称']
except:
EnglishName = None
try:
IxCode = result_dic['进出口企业代码']
except:
IxCode = None
try:
Address = result_dic['地址']
except:
Address = None
try:
Scope = result_dic['经营范围']
except:
Scope = None
aa_dict = {
'name': company_name, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': taxpayerNo, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
return aa_dict
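# 用法示例(示意,输入为假设数据):dic_handle 把中文键的工商信息映射为英文驼峰字段,缺失的键统一取 None
# aa = dic_handle({'企业名称': '示例科技有限公司', '统一社会信用代码': '91XXXXXXXXXXXXXXXX'})
# aa['name'] -> '示例科技有限公司'
# aa['socialCreditCode'] -> '91XXXXXXXXXXXXXXXX'
# aa['legalPerson'] -> None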
# 获取基本信息
def baseinfo(com_soup):
baseinfo = com_soup.find('div', class_='index_detail__JSmQM')
cominfo_list = baseinfo.find_all('div', class_='index_detail-info-item__oAOqL') #name
data = {}
for cominfo in cominfo_list:
name = cominfo.find('span', class_='index_detail-label__oRf2J').text.replace(':', '').replace(' ', '')
# print(name)
tag.deletep(cominfo, 'span', 'class', 'index_detail-label__oRf2J')
tag.deletep(cominfo, 'i', 'class', 'index_detail-text-desc__myXYK')
# print(info)
value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '')
if name == '法定代表人':
try:
value = cominfo.find('a').text
except:
value = None
if name == '电话':
try:
value = cominfo.find('span').text
except:
value = None
if name == '邮箱':
try:
value = cominfo.find('a').text
except:
value = None
if name == '网址':
try:
value = cominfo.find('a').text
except:
value = None
if name == '地址':
try:
value = cominfo.find('span').text
except:
value = None
data[name] = value
# print("==================")
briefTag = baseinfo.find('div', class_='index_detail-linewrap__AKtCa index_-intro__ma3Qd')
span_list = briefTag.find_all('span')
for span in span_list:
if len(span.attrs) == 0:
data['简介'] = span.text.split('通过天眼查大数据分析')[0]
break
return data
# 采集准备
def redaytowork(com_name, social_code, file_name):
log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
count = 0
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if social_code:
soup = checklogin(social_code)
else:
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
return count
else:
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
# 开始采集
try:
if spiderwork(soup, com_name, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3)
return count
else:
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
businessinfo = ''
if businessinfo:
try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ', '').replace('…','').replace('\n', '').replace('复制', '').split('(')[0]
except:
name = '曾用名'
value = ''
return value
else:
return ''
#解析时间
def paserTime(publishtime):
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
publishtime = current_datetime - relativedelta(months=day)
# publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
hours = re.findall(r'(\d+)小时', publishtime)
minutes = re.findall(r'(\d+)分钟', publishtime)
hour = int(hours[0]) if hours else 0
minute = int(minutes[0]) if minutes else 0
delta = datetime.timedelta(hours=hour, minutes=minute)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
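# 用法示例(示意,相对时间的结果随运行时刻变化,非固定值):
# paserTime('3天前')            -> 当前时间减去 3 天的 datetime 对象
# paserTime('2周前')            -> 当前时间减去 2 周的 datetime 对象
# paserTime('2024年02月26日')   -> datetime.datetime(2024, 2, 26, 0, 0)
# 无法识别的格式会打印“时间解析异常!!”并原样返回传入的字符串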
@retry(tries=2,delay=3)
def getBusinessinfo(com_soup):
com_soup_ = com_soup.find('div',attrs={'data-dim':'baseInfo'})
businessinfo = com_soup_.find('table', {'class': 'index_tableBox__ZadJW'})
if not businessinfo:
businessinfo = com_soup_.find('table', {'class': 'index_tableBox__ZadJW '})
if not businessinfo:
raise RuntimeError('工商信息未找到')
return businessinfo
# 采集基本信息和工商信息
def spiderinfo(company_url, receptname, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
driver.get(company_url)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
try:
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
log.info(f'天眼查无该企业{social_code}')
return
try:
businessinfo = getBusinessinfo(com_soup)
except:
businessinfo = ''
if businessinfo:
data_baseinfo = baseinfo(com_soup)
# print(data_baseinfo)
tr_list = businessinfo.find_all('tr')
dic_buseniss = {}
for tr in tr_list:
# td_count = len(tr.find_all('td'))
# print(td_count)
td_list = tr.find_all('td')
td_count = len(td_list)
name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
# print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
value_list = []
for i in range(td_count):
if i % 2 != 0:
value_tag = td_list[i]
# print(value_tag)
# print("==============")
tag.deletep(value_tag, 'span', 'class', 'index_history-operate__t3kjv')
tag.deletep(value_tag, 'div', 'class', '_efcb8')
tag.deletep(value_tag, 'span', 'class', 'index_legal-bottom-info__bYvYZ')
tag.deletep(value_tag, 'a', 'class', 'ml8 link-click')
tag.deletep(value_tag, 'span', 'class', 'index_report-jump__z__UW')
tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
# for value_tag in value_tag_list:
value_list.append(value_tag.text.replace('\xa0', ''))
# print(value_list)
if len(name_list) == len(value_list):
for i in range(len(name_list)):
dic_buseniss[name_list[i]] = value_list[i]
if '曾用名' in value_list[i]:
dic_buseniss['曾用名'] = value_list[i].split('曾用名')[1].split('更多')[0]
dic_buseniss[name_list[i]] = value_list[i].split('曾用名')[0]
if name_list[i] == '法定代表人':
value_list[i] = value_list[i].split('任职')[0]
dic_buseniss[name_list[i]] = value_list[i]
try:
del dic_buseniss['天眼评分']
except:
pass
# print(dic_buseniss)
result_dict = getinfo(dic_buseniss, data_baseinfo)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
result_dict['统一社会信用代码']
except:
# log.info('未获取到统一社会信用代码')
if social_code:
result_dict['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
result_dict['企业名称'] = result_dict['企业名称'][1:-1]
if result_dict['企业名称'] == '-' and com_name:
result_dict['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# print(result_dict)
# 采集成功的企业
data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(result_dict)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
log.info(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
else:
# todo: 重新放入redis
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
log.error(f'未找到工商信息,重新塞入redis')
token.updateTokeen(id_cookie, 3)
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# # sendkafka(aa_dic)
# log.info(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
text = re.sub(r'（|）', '', text)
# 清除英文小括号
text = re.sub(r'\(|\)', '', text)
return text.replace(' ', '')
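# 用法示例(示意):同时清除中英文小括号和空格,便于企业名称比对
# remove_parentheses('小米通讯（技术）有限公司 (Xiaomi)') -> '小米通讯技术有限公司Xiaomi'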
# 判断名称是否统一
def spiderwork(soup, receptname, file_name):
company_url = ''
try:
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
# receptname = '小米通讯技术有限公司'
for compamy in company_list:
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
company_url = info_t.find('a')['href']
break
elif not receptname:
company_url = info_t.find('a')['href']
break
else:
jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url:
# 采集基本信息和工商信息
spiderinfo(company_url, receptname, file_name)
else:
# 判断是否是曾用名
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
# print(child['class'])
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
break
# tr = company_list[:1][0]
# info_t = tr.find('div', class_='index_name__qEdWi')
# getname = info_t.find('span').text
if getname:
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, receptname, file_name)
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
return True
def login():
# time.sleep(10)
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/天眼查'
# driver = create_driver()
# #手动登录
# driver.get('https://www.tianyancha.com/')
#todo:绕过验证使用cookies登录
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
# todo:绕过验证使用cookies登录
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
company_field = '91330000742906207U|浙江我武生物科技股份有限公司'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name)
time.sleep(20)
file.deleteFile(file_name)
continue
if company_field == '' or company_field is None:
# 本轮结束后没有新增的企业要采集
file.deleteFile(file_name)
flag = True
while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
continue
continue
# company_field_ = f'|{company_field}'
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[1].replace(' ', '')
if 'ZZSN' in social_code or 'ZD' in social_code:
continue
#todo:查询天眼查id
data = baseCore.getInfomation(social_code)
if len(data) != 0:
tycid = data[11]
else:
# 数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
# 数据库中并没有该企业 需要新增
pass
com_name_c = data[3]
xydm = data[1]
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name_c, xydm))
cnx_.commit()
tycid = ''
if tycid == None or tycid == '':
count = redaytowork(com_name, social_code, file_name)
else:
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, com_name, file_name)
time.sleep(10)
break
baseCore.close()
\ No newline at end of file
......@@ -223,7 +223,9 @@ def spiderinfo(company_url, receptname, file_name):
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
redaytowork(com_name, social_code, file_name)
log.info(f'天眼查无该企业{social_code}')
return
aa_dict = {
'name': receptname, # 企业名称
'shortName': None, # 企业简称
......@@ -326,7 +328,7 @@ if __name__ == '__main__':
driver.get('https://www.tianyancha.com/')
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
# cookies_list, id_cookie = token.get_cookies()
# cookies = {}
......@@ -336,8 +338,8 @@ if __name__ == '__main__':
# s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
company_field = '913100006073602992|光明乳业股份有限公司'
company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
# company_field = '913100006073602992|光明乳业股份有限公司'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
......@@ -398,7 +400,7 @@ if __name__ == '__main__':
count = redaytowork(com_name, social_code, file_name)
else:
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, social_code, file_name)
spiderinfo(company_url, com_name, file_name)
time.sleep(10)
# break
baseCore.close()
\ No newline at end of file
......@@ -59,7 +59,8 @@ class Token():
result = db_storage.find_one(query, sort=[('updateTime', 1)])
cookies = result['cookies']
id_token = result['_id']
return cookies, id_token
user_name = result['name']
return cookies, id_token, user_name
# 删除失效的token
def delete_token(self, cookie_):
......
"""
从es中拿到所有的标题
"""
import redis
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name):
body = {
"query": {
"bool": {
"must": [
{
"nested": {
"path": "labels",
"query": {
"match": {
"labels.relationId": "91330000747735638J"
}
}
}
},
{
"range": {
"createDate": {
"gte": "2024-02-26T13:00:00",
"lte": "2024-02-27T00:00:00"
}
}
},
{
"term": {
"type.keyword": {
"value": "3"
}
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 100
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.title',
'hits.hits._source.origin',
'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
if __name__ == '__main__':
es_method = EsMethod()
# 连接Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
result = es_method.queryatt('researchreportdata')
total = result['hits']['total']['value']
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
log.info(f'---第1页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
title = mms['_source']['title']
origin = mms['_source']['origin']
pub_time = mms['_source']['publishDate']
try:
log.info(f'{id}--{title}--{origin}--')
item = id + "|" + title
# r.lrem(f'XJPdatabase:id_2', 0, item)
r.lpush(f'91330000747735638J:id', item)
except:
continue
......@@ -31,7 +31,7 @@ class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'),timeout=300 )
self.index_name='researchreportdata'
self.index_name='researchreportdata_2024'
'''
删除
......@@ -47,13 +47,13 @@ if __name__ == "__main__":
redis_conn = redis.Redis(connection_pool=pool)
while True:
# 从redis中读取数据,去附件表中根据title查询,更新查到的附件id
item = redis_conn.lpop('YanBao:id')
item = redis_conn.lpop('91330000747735638J:id')
if item:
log.info(item)
id = item.decode()
id = int(item.decode().split('|')[0])
try:
esMethod.delete(esMethod.index_name,id)
except:
except Exception as e:
continue
else:
log.info('已删除完毕')
......
......@@ -51,7 +51,7 @@ def parse_excel():
def get_content1():
print_result_list = []
result_dict_list = []
# query = {"专家库主键id":"1204"}
# query = {"专家库主键id":"141"}
# for db_dict in db_storage.find(query):
for db_dict in db_storage.find():
del db_dict['_id']
......
......@@ -2,8 +2,12 @@
中证智能财讯
"""
import json
import os
import sys
import time
import redis
from kafka import KafkaProducer
from obs import ObsClient
import fitz
import requests
......@@ -11,6 +15,10 @@ from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
from tempfile import NamedTemporaryFile
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
......@@ -36,68 +44,14 @@ def create_driver():
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
# result = obsClient.putFile('zzsn', pathType+name, file_path=response)
result = obsClient.putFile('zzsn', pathType+name, file_path=response)
return result
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(baseCore.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = getOBSres(pathType, now_time, name, response)
except:
log = baseCore.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = baseCore.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = baseCore.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
def zzcx():
driver = create_driver()
driver.maximize_window()
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
# payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -115,11 +69,12 @@ def zzcx():
'Origin': 'https://zzcx.cs.com.cn',
'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
}
payload = json.dumps(payload)
result_json = requests.post(url=url, data=payload, headers=headers).json()
print(result_json)
pages = result_json['data']['pages']
for page in range(1, int(pages + 1)):
# payload = json.dumps(payload)
# result_json = requests.post(url=url, data=payload, headers=headers).json()
# print(result_json)
# pages = result_json['data']['pages']
pages = 5
for page in range(1, int(pages) + 1):
payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
payload_page = json.dumps(payload_page)
datas = requests.post(url=url, data=payload_page, headers=headers)
......@@ -127,24 +82,135 @@ def zzcx():
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=3ec65751b63e40d7813a0c6bbe9b3135'
try:
flag = r.sismember('IN-20240129-0001', news_url)
if flag:
log.info('信息已采集入库过')
continue
except Exception as e:
continue
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=2eeeb171e36b42ada02dad77b80038b1'
# 使用模拟浏览器打开
driver = create_driver()
driver.get(news_url)
div_ = driver.find_element(By.ID, 'line')
div = div_.find_element(By.XPATH, '..')
image_data = div.screenshot_as_base64
div_photo = driver.find_elements(By.ID, 'line')
for png_ in div_photo:
try:
div = png_.find_element(By.XPATH, './/div/div[1]/div')
# div = png_.find_element(By.CLASS_NAME, 'ant-col ant-col-17')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", div)
time.sleep(1)
#todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
# baseCore.uptoOBS()
html = driver.page_source
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
#todo:替换标签 删除标签
dele_tag = png_.find_element(By.XPATH, './/div/div[1]//div')
driver.execute_script("arguments[0].remove()", dele_tag)
#todo:将图片塞进去 新建一个new_tag
append_tag = png_.find_element(By.XPATH, './/div/div[1]')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
append_tag)
os.remove(file_path)
except:
continue
# div_undefined_line = driver.find_elements(By.ID, 'k-line-undefined')
div_undefined_line = driver.find_elements(By.ID, 'KLineSubscription')
for u_png in div_undefined_line:
div_u = u_png.find_element(By.XPATH, './/div')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", div_u)
time.sleep(3)
# todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div_u.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
# todo:替换标签 删除标签
dele_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag)
# todo:将图片塞进去 新建一个new_tag
# append_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
u_png)
os.remove(file_path)
div_line_bar = driver.find_elements(By.ID, 'bar-line-bar-line')
for lin_bar_tag in div_line_bar:
line_bars = lin_bar_tag.find_elements(By.XPATH, './/div[contains(@class, "ant-col-11")]')
for line_bar in line_bars:
photo_line_bar = line_bar.find_element(By.XPATH, './/div')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", photo_line_bar)
time.sleep(1)
# todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
photo_line_bar.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
# todo:替换标签 删除标签
dele_tag_ = line_bar.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag_)
# todo:将图片塞进去 新建一个new_tag
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
line_bar)
# #todo:创建清晰的图片标签
# driver.execute_script(f"""
# var img = new Image();
# img.src = "http://zzsn.luyuen.com{path}"; // 替换为你的图片路径
# img.onload = function() {{
# var canvas = document.createElement("canvas");
# canvas.width = img.width;
# canvas.height = img.height;
# var ctx = canvas.getContext("2d");
# ctx.drawImage(img, 0, 0);
# document.body.appendChild(canvas);
# }}; arguments[0].insertBefore(img, arguments[0].firstChild);
# """, line_bar)
os.remove(file_path)
html = driver.page_source
news_soup = BeautifulSoup(html, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
# print(contentWithTag)
for tag in contentWithTag.find_all('span'):
if tag.text == '\ufeff':
tag.decompose()
content = contentWithTag.text
info_code = 'IN-20240129-0001'
result_dict = {
......@@ -152,25 +218,30 @@ def zzcx():
'sid': '1751787750127857666',
'title': title,
'organ': origin,
'origin': '国务院国有资产监督管理委员会',
'origin': origin,
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'contentWithTag': str(contentWithTag),
'publishDate': publishDate,
'sourceAddress': news_url,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
log.info(f'{page}--{title}--{news_url}')
print(result_dict)
# break
# break
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
r.sadd(info_code, news_url)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
if __name__ == "__main__":
pathType = 'PhotoDingzhi/'
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
zzcx()
\ No newline at end of file
# 中央全面深化改革委员会会议
import json
import sys
import time
import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
......@@ -26,22 +32,50 @@ headers = {
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'news.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
request = requests.get(url=url, headers=header)
soup = BeautifulSoup(request.content, 'html.parser')
# print(soup)
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
# info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
flag = 1
for info_html in info_html_list:
if flag == 1:
info_code = 'IN-20230816-0004'
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
info_code = 'IN-20230816-0005'
ul_list = info_html.find('ul', class_='ul_list').find_all('li')
for ul in ul_list[::-1]:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
......@@ -51,18 +85,27 @@ if __name__ == "__main__":
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
try:
flag = r.sismember(info_code, newsUrl)
if flag:
log.info('信息已采集入库过')
continue
except Exception as e:
continue
news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
# print(news_soup)
try:
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
except Exception as e:
log.error(f'解析网页出错{newsUrl}')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
......@@ -79,6 +122,7 @@ if __name__ == "__main__":
'createDate': time_now,
}
r.sadd(info_code, newsUrl)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
......@@ -90,3 +134,4 @@ if __name__ == "__main__":
print('发送kafka异常!')
finally:
producer.close()
flag += 1
\ No newline at end of file
......@@ -174,6 +174,76 @@ def zyzsjg():
# sendKafka(data)
print(data)
def dfrwk():
datas_df = []
url_df = 'http://district.ce.cn/zt/rwk/'
req = requests.get(url=url_df, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
df_list = soup.find('div', class_='left1').find_all('div')
for df in df_list:
df_place = df.text.replace('\n', '')
try:
df_href = df.find('a')['href']
except:
df_href = ''
if df_href:
datas_df.append([df_place,df_href])
print(datas_df)
peoples = []
for data in datas_df:
place = data[0]
href = data[1]
req_df = requests.get(url=href, headers=headers)
soup_df = BeautifulSoup(req_df.content, 'html.parser')
df_list_df = soup_df.find_all('div', class_='left2')
for df in df_list_df:
try:
rwpart = df.find('div', class_='ren2')
except:
log.error(f'{place}===={href}')
continue
if rwpart:
pass
else:
continue
tr_list = rwpart.find_all('tr')
for tr in tr_list:
td_list = tr.find_all('td')
if len(td_list) == 3:
leader = td_list[1].text
try:
leader_href = td_list[1].find('a')['href']
except:
leader_href = ''
# continue
position = td_list[2].text
print(place, leader, position)
if len(td_list) == 2:
leader = td_list[0].text
try:
leader_href = td_list[0].find('a')['href']
except:
leader_href = ''
# continue
position = td_list[1].text
print(place, leader, position)
people = {
'name': leader, # 姓名
'sex': '', # 性别
'work': position, # 职务
'birthplace': '', # 出生地
'birthday': '', # 出生日期
'company': '', # 曾任单位
'city': '', # 关联城市
'school': '', # 毕业院校
'province': '', # 省或直辖市
'type': 3, # 直属类别(1:部委人物库 2:中直任务库 3:地方人物库)
'department': '', # 部门
'headSculpture': '', # 照片链接
}
# print(name)
peoples.append(people)
def gwybw_task():
# 实例化一个调度器
......@@ -200,11 +270,12 @@ def zyzsjg_task():
if __name__ == "__main__":
try:
gwybw_task()
except:
log.error('部委人物采集出错')
try:
zyzsjg_task()
except:
log.error('中直人物采集出错')
# try:
# gwybw_task()
# except:
# log.error('部委人物采集出错')
# try:
# zyzsjg_task()
# except:
# log.error('中直人物采集出错')
dfrwk()
import os
......@@ -46,8 +46,28 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
......@@ -61,7 +81,6 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except:
time.sleep(3)
continue
page_size = 1
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
......@@ -85,7 +104,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
......
"""
"""
港股公告-更换采用接口的方式
"""
import os
import subprocess
import sys
import uuid
import fitz
import requests
from bs4 import BeautifulSoup
import time, json
from kafka import KafkaProducer
from obs import ObsClient
from urllib.parse import unquote
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
pathType = 'QYNotice/'
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
#获取文件大小
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units)-1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
except:
log.error(f'OBS发送失败')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def secrchATT(item_id, retData, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s '''
cursor_.execute(sel_sql, (item_id, retData['path'], type_id))
selects = cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(retData, com_name, year, pdf_name, num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
# selects = secrchATT(item_id, pdf_name, type_id)
#
# if selects:
# log.info(f'pdf_name:{pdf_name}已存在')
# id = ''
# return id
# else:
try:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name+category, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
except Exception as e:
log.info(e)
log.info(f"更新完成:{item_id}===={pdf_name+category}")
try:
selects = secrchATT(item_id, retData, type_id)
except Exception as e:
log.info(e)
return ''
id = selects[0]
return id
def InsterInto(social_code, pdf_url,pub_time,pdf_name):
insert = False
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
list_info = [
social_code,
pdf_url,
'东方财富网',
'1',
pub_time[:10],
pdf_name
]
#144数据库
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
insert = True
return insert
except:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert
def ifInstert(short_name, social_code, title, info_date):
ifexist = True
aa = info_date[:10]
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and title = %s and publish_time = %s and origin='东方财富网' and type='1' '''
cursor.execute(sel_sql, (social_code, title, aa))
selects = cursor.fetchone()
#如果数据库中存在 则跳过
if selects:
ifexist = False
log.info(f'com_name:{short_name}、{title}, {info_date}已存在')
return ifexist
else:
return ifexist
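# 用法示例(示意,参数均为假设数据):标题+发布日期在库中已存在则返回 False,否则返回 True
# ifInstert('示例股份', '91XXXXXXXXXXXXXXXX', '2023年年度报告', '2024-03-01 00:00:00')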
def sendKafka(social_code,newsUrl,dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportNoticeTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, newsUrl, 'Kafka操作失败')
log.info(dic_result)
return False
def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
# 上传至华为云服务器
retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
# 附件插入att数据库
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
# 获取当前进程pid
current_pid = baseCore.getPID()
# todo: 重新启动新进程,杀死当前进程
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return False
num = num + 1
att_id = tableUpdate(retData, com_name, year, pdf_name, num)
if att_id:
pass
else:
return False
content = retData['content']
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': att_id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': lang,
'origin': '东方财富网',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': info_url, # 原文链接
'summary': '',
'title': pdf_name.replace('.pdf', ''),
'type': 3,
'socialCreditCode': social_code,
'year': year
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportNoticeTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
log.info(dic_result)
return False
def gonggao_info(dic_info):
code = dic_info[3]
com_name = dic_info[1]
social_code = dic_info[2]
if 'HK' in code:
pass
else:
return
#https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index=1&ann_type=H&client_source=web&stock_list=00175&f_node=0
url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index=1&ann_type=H&client_source=web&stock_list={code.split(".HK")[0]}&f_node=0'
for n1 in range(0, 3):
try:
res = requests.get(url, verify=False)
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
res_json = res.json()
total_hits = res_json['data']['total_hits']
total_pages = (total_hits + 49) // 50
for page1 in range(1, total_pages + 1):
url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index={page1}&ann_type=H&client_source=web&stock_list={code.split(".HK")[0]}&f_node=0'
for n1 in range(0, 3):
try:
res = requests.get(url, verify=False)
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
res_json = res.json()
list_all = res_json['data']['list']
if list_all:
for one_info in list_all:
title = one_info['title']
info_date = one_info['notice_date']
year = info_date[:4]
# if page1 > 1 and '2022' in info_date:
# break_id = 1
# break
# if '2021' in info_date: # 只采集22年以后的数据
# break_id = 1
# break
try:
info_type = one_info['columns'][0]['column_name']
except:
info_type = ''
art_code = one_info['art_code']
info_url = 'https://data.eastmoney.com/notices/detail/' + code + '/' + art_code + '.html'
t = int(time.time() * 1000)
# https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code=AN202308221595478274&client_source=web&page_index=1&_=1708918810986
json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
for n1 in range(0, 3):
try:
ip = baseCore.get_proxy()
json_2 = requests.get(json_url, proxies=ip,verify=False).json()
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(60)
continue
try:
pdf_url = json_2['data']['attach_url']
except:
pdf_url = ''
try:
info_content = json_2['data']['notice_content']
except:
info_content = ''
ifexist = ifInstert(com_name, social_code, title, info_date)
# ifexist = True
if ifexist:
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, info_url,title, social_code, year, info_date, start_time, com_name, num)
if result:
# 公告信息列表
log.info(f'{com_name}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
# 发送kafka成功之后 再插入数据库
insert = InsterInto(social_code, info_url, info_date, title)
if insert:
log.info(f'===={social_code}========{title}=====插入库成功')
pass
else:
continue
else:
log.info(f'======={com_name}========{code}===已存在')
continue
if __name__ =='__main__':
# 从redis中读取social_code
list_c = []
list_all_info_1 = []
num = 0
taskType = '企业公告/东方财富网'
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:ggqy_socialCode_add')
social_code = '91330000747735638J'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
# count = dic_info[15]
code = dic_info[3]
com_name = dic_info[1]
log.info(f'-----开始处理{com_name}----{social_code}------')
try:
gonggao_info(dic_info)
except:
log.info(f'-----error:{com_name}----{social_code}------')
break
import os
......@@ -48,6 +48,7 @@ def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
......@@ -56,15 +57,40 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
response = requests.get(pdf_url, headers=headers,verify=False,proxies=ip, timeout=20)
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except Exception as e:
time.sleep(60)
except:
time.sleep(3)
continue
name = str(getuuid()) + category
......@@ -73,12 +99,6 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except:
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
except:
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
......@@ -95,11 +115,12 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
......
......@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(20)
time.sleep(80)
s = requests.session()
#获取到token和cookies
......
联合国:https://www.un-ilibrary.org/content/papers/27082822
世界经贸组织
https://docs.wto.org/dol2fe/Pages/FE_Search/FE_S_S006.aspx?Language=English&SourcePage=FE_B_009&Context=Script&DataSource=Cat&Query=(%40Symbol%3d%22WT%2fLET*%22+AND+(%40Title%3d(modifications+OR+rectifications)+AND+schedule))&languageUIChanged=true
经合组织
https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en
国际化经营-欧盟
https://ec.europa.eu/eurostat/databrowser/explore/all/tb_eu?lang=en&display=list&sort=category
\ No newline at end of file
......@@ -272,34 +272,6 @@ class BaseCore:
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
self.pool_11 = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.116.44.11',
port=3306,
user='caiji',
password='f7s0&7qqtK',
database='clb_project',
charset='utf8mb4'
)
def check_mysql_conn(self,conn):
try:
conn.ping()
......@@ -461,16 +433,6 @@ class BaseCore:
panduan = ppp
return panduan
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
......@@ -496,460 +458,3 @@ class BaseCore:
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
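    # Usage sketch (the driver path below is only an illustrative assumption, not taken from this repo):
    #   driver = baseCore.buildDriver(r'D:\soft\chromedriver.exe', headless=True)
    #   driver.get('https://www.example.com')
    #   html = driver.page_source
    #   driver.quit()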
# 根据社会信用代码获取企业信息
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 根据企业名称获取企业信息
def getBYnameInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE CompanyName = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 根据企业名称获取企业信息
def getBYtycidInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE TYCID = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
# 获取企查查token
def GetToken(self):
# 获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list) - 1)][0]
except:
token = ''
return token
# 删除失效的token
def delete_token(self, token):
deletesql = f"delete from QCC_token where token='{token}' "
self.cursor.execute(deletesql)
self.cnx.commit()
# 获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
# 检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
if result == '':
return 'cn'
if result[0] == '':
return 'cn'
if result[0] == 'ja':
return 'jp'
if result[0] == 'fr':
return 'fra'
if result[0] == 'es':
return 'spa'
if result[0] == 'fi':
return 'fin'
if result[0] == 'vi':
return 'vie'
if result[0] == 'ko':
return 'kor'
if result[0] == 'da':
return 'dan'
return result[0]
#创建excel文件
def check_excel_file(self,file_path):
if os.path.isfile(file_path):
self.getLogger().info("Excel文件已存在")
return True
else:
self.getLogger().info("Excel文件不存在,正在创建...")
return False
# 追加接入excel
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
        # 将新数据添加到现有数据的末尾(DataFrame.append 在 pandas 2.x 已移除,改用 concat)
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
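    # Usage sketch for check_excel_file + writerToExcel (the file name and columns below are
    # illustrative assumptions): create the workbook with headers on first use, then append rows.
    #   if not baseCore.check_excel_file('采集结果.xlsx'):
    #       pd.DataFrame(columns=['social_code', 'name']).to_excel('采集结果.xlsx', index=False)
    #   baseCore.writerToExcel([{'social_code': 'DEMO123456', 'name': '示例企业'}], '采集结果.xlsx')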
# 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item)
# 增加计数器的值并返回增加后的值
def incrSet(self, key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
# 获取key剩余的过期时间
def getttl(self, key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
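    # Usage sketch: the two helpers above can act as a simple hourly rate limiter
    # (the key name and the limit of 100 are illustrative assumptions, not from this repo):
    #   baseCore.getttl('RequestCount')          # reset the counter to 0 with a 3600s TTL once it has expired
    #   if baseCore.incrSet('RequestCount') > 100:
    #       time.sleep(60)                       # back off after more than 100 calls in the current hour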
# # 上传至文件服务器,并解析pdf的内容和页数
# def upLoadToServe(self, pdf_url, type_id, social_code):
# headers = {}
# retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
# 'full_path': '',
# 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
# 'create_time': '', 'page_size': '', 'content': ''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # pdf解析失败
# print(f'======pdf解析失败=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
def deliteATT(self,id):
delitesql = f"delete from clb_sys_attachment where id = '{id}' "
self.cursor_.execute(delitesql)
self.cnx_.commit()
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, year, pdf_name, num, pub_time,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id, year, type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}--{year}已存在')
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
pub_time,origin)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, year, type_id)
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self, social_code, cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
    # 上传至obs华为云服务器,并解析pdf的内容和页数
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
# 查看obs文件是否已经上传
def obsexist(self, file_path):
# # 文件路径
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# 检查文件是否存在
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
self.getLogger().info('=====文件不存在obs=====')
return True
else:
self.getLogger().info(f'=====文件存在obs========{file_path}')
#uuid 根据时间戳生成 文件名 上传到obs
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by,headers):
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(self.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = self.getOBSres(pathType, now_time, name, response)
except:
log = self.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = self.getTimeCost(start_time, time.time())
self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(self, pathType, now_time, name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def sendEmail(self, file_name):
file = open(file_name, 'rb').read()
# 发送邮箱地址
sender = '1195236739@qq.com'
# 接收邮箱地址
receiver = 'fujunxue@ciglobal.cn'
smtpserver = 'smtp.qq.com'
# 发送邮箱登录 账户 密码
username = '1195236739@qq.com'
password = 'gatvszshadvpgjci'
maile_title = '企业基本信息采集情况'
message = MIMEMultipart()
message['From'] = sender
message['To'] = receiver
message['Subject'] = Header(maile_title, 'utf-8')
message.attach(MIMEText('企业基本信息采集情况', 'plain', 'utf-8'))
xlsxApart = MIMEApplication(file)
xlsxApart.add_header('Content-Disposition', 'attachment', filename='企业基本信息采集情况.xlsx')
message.attach(xlsxApart)
smtpObj = smtplib.SMTP_SSL(smtpserver) # 注意:如果遇到发送失败的情况(提示远程主机拒接连接),这里要使用SMTP_SSL方法
smtpObj.connect(smtpserver, port=465)
smtpObj.login(username, password)
smtpObj.sendmail(sender, receiver, message.as_string())
print("邮件发送成功!!!")
smtpObj.quit()
"""
国外智库-欧盟 经合组织
"""
import json
import time
import pymongo
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from kafka import KafkaProducer
from retry import retry
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'国外智库']
@retry(tries=2, delay=5)
def sendKafka(dic):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("research_center_fourth",
json.dumps(dic, ensure_ascii=False).encode('utf8'))
log.info(f'{dic["sourceAddress"]}传输成功')
def secrchATT(item_id, retData, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
baseCore.cursor_.execute(sel_sql, (item_id, retData['path'], type_id, order_by))
selects = baseCore.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(retData, file_name, num, publishDate,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name+'.pdf', type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, object_key, 'zzsn', publishDate,origin)
baseCore.cursor_.execute(Upsql, values) # 插入
baseCore.cnx_.commit() # 提交
baseCore.getLogger().info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, retData, type_id, order_by)
id = selects[0]
return id
def save_data(dic_news):
aaa_dic = {
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': '',
'来源': f"经济合作与发展组织",
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate'],
'标题': dic_news['title']
}
db_storage.insert_one(aaa_dic)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
headers = {
'Content-Type': 'application/json',
}
dic_info = {
'title': title,
# 'summary': '<div>apple</div>',
'contentWithTag': contentWithTag
}
dic_info = json.dumps(dic_info)
req = requests.post('http://117.78.23.14:5001/translate', data=dic_info, headers=headers)
dataJson = req.json()
    if dataJson['status'] == 'failed':
        raise Exception('翻译接口返回failed')
titleRaw = dataJson['title']
contentWithTagRaw = dataJson['contentWithTag']
titleRaw = BeautifulSoup(titleRaw,'html.parser')
titleRaw = titleRaw.text
contentWithTagRaw = BeautifulSoup(contentWithTagRaw,'html.parser')
return titleRaw, contentWithTagRaw
def doJob():
num = 1
url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
div_list = div_part.find_all('div', class_='row panel')
for div in div_list:
start_time = time.time()
title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info(f'{href}===已采集')
continue
pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
# 定义原始时间的格式
time_format = "%d %b %Y"
# 转换为标准时间
standard_time = datetime.strptime(pubtime_, time_format).strftime("%Y-%m-%d")
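        # e.g. pubtime_ '30 Jan 2023' -> standard_time '2023-01-30'; only items published after 2023-01-30 are kept below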
if standard_time > '2023-01-30':
pass
else:
break
year = standard_time[:4]
pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
req_news = requests.get(url=href, headers=headers)
soup_news = BeautifulSoup(req_news.content, 'html.parser')
# print(title, standard_time, pdf_url, href)
contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
content = contentWithTag.get_text()
# todo:翻译
try:
titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
log.info(f'{href}===翻译成功')
except Exception as e:
log.error(f'{href}===翻译失败==={e}')
continue
        retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by, headers)
num += 1
id_list = []
if retData['state']:
att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
if att_id:
id_list.append(att_id)
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
lang = baseCore.detect_language(content)
contentRaw = contentWithTagRaw.text
contentWithTagRaw = str(contentWithTagRaw)
dic = {
'id': f'1620244462491893761{int(time.time())}',
'subjectId': '1620244462491893761',
'checkStatus': 1,
'deleteFlag': 0,
'topNum': 0,
'content': content,
'contentRaw': contentRaw,
'contentWithTag': str(contentWithTag),
'contentWithTagRaw': contentWithTagRaw,
'createDate': now,
'labels': [
{'labelMark': 'organization', 'relationId': '1619903523269271554', 'relationName': '经济合作与发展组织'}],
'lang': lang,
'origin': '经济合作与发展组织',
'publishDate': standard_time,
'sourceAddress': href,
'title': title,
'titleRaw': titleRaw,
'updateDate': now,
'attachmentIds':id_list
}
sendKafka(dic)
try:
save_data(dic)
except:
log.error(f'{href}===数据库保存失败')
# break
if __name__ == "__main__":
pathType = 'PolicyDocuments/'
taskType = '国外智库-经合组织'
create_by = 'XueLingKun'
doJob()
......@@ -119,17 +119,17 @@ if __name__=='__main__':
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \
# or '”' in author\
# or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
if '(' in author or '本刊' in author \
or '记者' in author or '新闻社' in author \
or '”' in author\
or '大学' in author or '洛桑江村' in author:
continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
if is_member:
continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
......@@ -165,7 +165,7 @@ if __name__=='__main__':
}
log.info(dic_news)
if sendKafka(dic_news):
r.sadd('qiushileaderspeech_two::' + period_title, new_href)
r.sadd('qiushileaderspeech::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}')
import csv
import time
import pandas as pd
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.common import StaleElementReferenceException
from base import BaseCore
from requests.packages import urllib3
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urllib3.disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -33,54 +40,100 @@ headers = {
'sec-ch-ua-platform': '"Windows"',
}
# todo:使用模拟浏览器
def create_driver():
path = r'D:\soft\msedgedriver.exe'
    # options = webdriver.EdgeOptions()
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [], "args": ["--start-maximized"]  # 添加最大化窗口运作参数
        }
    }
    driver = webdriver.Edge(executable_path=path, capabilities=options)
    return driver
@retry(tries=2, delay=5)
def getHref(Keywords, driver):
# data = {
# 'Menu': 'law',
# 'Keywords': Keywords,
# 'PreKeywords': Keywords,
# 'SearchKeywordType': 'Title',
# 'MatchType': 'Exact',
# 'RangeType': 'Piece',
# 'Library': 'chl',
# 'ClassFlag': 'chl',
# 'GroupLibraries': '',
# 'QuerySearchCondition': 'Title+Exact+Piece+0',
# 'QueryOnClick': False,
# 'AfterSearch': True,
# 'RequestFrom': 'btnSearch',
# 'SearchInResult': '',
# 'PreviousLib': 'chl',
# 'IsSynonymSearch': 'false',
# 'RecordShowType': 'List',
# 'ClassCodeKey': ',,,,,,',
# 'IsSearchErrorKeyword': '',
# 'FirstQueryKeywords': Keywords,
# 'FirstQueryKeywordType': 'Title',
# 'IsSynonymSearch': 'false',
# 'X-Requested-With': 'XMLHttpRequest',
# }
driver.get('https://sclx.pkulaw.com/law')
# ip = baseCore.get_proxy()
driver.find_element(By.ID, 'txtSearch').send_keys(Keywords)
time.sleep(0.5)
driver.find_element(By.CLASS_NAME, 'btn-search').click()
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "accompanying-wrap")))
getpart = driver.find_element(By.CLASS_NAME, 'accompanying-wrap')
# li_list = getpart.find_elements(By.TAG_NAME, 'li')
# for li in li_list:
driver.execute_script("arguments[0].scrollIntoView();", getpart)
time.sleep(2)
try:
element = getpart.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
time.sleep(1)
driver.execute_script("arguments[0].scrollIntoView();", element)
time.sleep(1)
element.click()
href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-tab-col")))
info_part = driver.find_element(By.CLASS_NAME, 'a-tab-col').find_element(By.XPATH, './/div[@name="HistoryAssociation"]')
# except Exception as e:
except StaleElementReferenceException:
# 元素已经stale,重新定位元素
element = driver.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
element.click() # 再次尝试与元素交互
href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
# log.info(e)
# href = ''
return href
# url = 'https://sclx.pkulaw.com/law/chl'
# req = requests.post(url, headers=headers, data=data, proxies=ip)
# req = requests.post(url, headers=headers, data=data, verify=False)
# req.encoding = req.apparent_encoding
# soup = BeautifulSoup(req.text, 'html.parser')
# try:
# tag = soup.find('div', class_='accompanying-wrap').find('div', class_='item').find('li', attrs={
# 'name': 'HistoryAssociation'})
# href = 'https://sclx.pkulaw.com' + tag.get('url')
# except:
# href = ''
# return href
@retry(tries=3, delay=5)
def getData(href, Keywords):
term = Keywords
# ip = baseCore.get_proxy()
# req = requests.get(href, headers=headers, proxies=ip)
req = requests.get(href, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
li_list = soup.find_all('li')
......@@ -90,22 +143,59 @@ def getData(href):
theme = li.find('div', class_='theme').text.strip()
except:
theme = ''
try:
relevance = li.find('div', class_='relevance').text.strip()
except:
relevance = ''
data.append([publishDate,theme,relevance])
time.sleep(1)
return data
# try:
# relevance = li.find('div', class_='relevance').text.strip()
# except:
# relevance = ''
# log.info(f'{publishDate}==={theme}==')
term += ',' + theme + '_' + publishDate
log.info(term)
if ',' not in term or '_' not in term:
r.rpush('ShenjisclxError:', Keywords)
return None
return term
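# term is a flat CSV-style string: '<keyword>,<theme1>_<publishDate1>,<theme2>_<publishDate2>,...',
# which doJob() below pushes into the 'ShenjisclxReault:' Redis list.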
def doJob():
    # Keywords = '中华人民共和国公司法(2023修订)'
data_list = []
driver = create_driver()
driver.maximize_window()
while True:
try:
Keywords = r.lpop('Shenjisclx:').decode()
# Keywords = '中华人民共和国银行业监督管理法(2006修正)'
except:
Keywords = ''
if Keywords:
try:
href = getHref(Keywords, driver)
if href:
                    r.rpush('ShenjisclxHref:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====找到=== {href}')
                    term = getData(href, Keywords)
else:
term = Keywords + ','
r.rpush('ShenjisclxHrefNull:', f'{Keywords}|{href}')
log.info(f'{Keywords}====未找到')
if term:
# data_list.append(term)
r.rpush('ShenjisclxReault:', term)
except:
r.rpush('ShenjisclxError:', Keywords)
continue
time.sleep(2)
else:
break
# print(data_list)
# with open('./output.csv', 'w', newline='') as file:
# writer = csv.writer(file)
#
# # 写入数据
# for row in data_list:
# writer.writerow(row.split(','))
#
# print('数据已成功写入CSV文件')
if __name__ == '__main__':
doJob()
......
import csv
# 要写入的数据
# data = [
# ['Name', 'Age', 'City'],
# ['Alice', 25, 'New York'],
# ['Bob', 30, 'Los Angeles'],
# ['Charlie', 35, 'Chicago']
# ]
data = ['aaaa,bbbb,cccc', 'aaaa,cccc,ffff']
# 打开CSV文件进行写入
with open('./output.csv', 'w', newline='') as file:
writer = csv.writer(file)
# 写入数据
for row in data:
writer.writerow(row.split(','))
print('数据已成功写入CSV文件')
import csv
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
if __name__ == "__main__":
with open('./output0220_1.csv', 'w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
while True:
try:
term_ = r.lpop('ShenjisclxError:').decode()
term = str(term_) + ','
except:
term = ''
if term == '':
break
else:
# 写入数据
writer.writerow(str(term).split(','))
......@@ -170,5 +170,71 @@ for data in datas:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5001/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text)
# import re, datetime
#
#
# def paserTime(publishtime):
# timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
# current_datetime = datetime.datetime.now()
# publishtime = publishtime.strip()
# print(publishtime)
#
# try:
# if '年前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=365 * day)
# publishtime = current_datetime - delta
# elif '月前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(months=day)
# publishtime = current_datetime - delta
# elif '周前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(weeks=day)
# publishtime = current_datetime - delta
# elif '天前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=day)
# publishtime = current_datetime - delta
# elif '前天' in publishtime:
# delta = datetime.timedelta(days=2)
# publishtime = current_datetime - delta
# elif '昨天' in publishtime:
# current_datetime = datetime.datetime.now()
# delta = datetime.timedelta(days=1)
# publishtime = current_datetime - delta
# elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
# if '小时' in publishtime:
# hour = publishtime.split("小时")[0]
# else:
# hour = 0
# if hour != 0:
# min = publishtime.split("小时")[1].split("分钟")[0]
# else:
# min = publishtime.split("分钟")[0]
#
# delta = datetime.timedelta(hours=int(hour), minutes=int(min))
# publishtime = current_datetime - delta
# elif '年' in publishtime and '月' in publishtime:
# time_format = '%Y年%m月%d日'
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# elif '月' in publishtime and '日' in publishtime:
# current_year = current_datetime.year
# time_format = '%Y年%m月%d日'
# publishtime = str(current_year) + '年' + publishtime
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# except Exception as e:
# print('时间解析异常!!')
# return publishtime
#
# if __name__ == "__main__":
# publishtime_ = '1小时17分钟前'
# publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
# print(publish_time)
\ No newline at end of file
# -*- coding: utf-8 -*-
......@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables):
try:
del post_dict['is_repeat']
del post_dict['tags']
del post_dict['title_pd']
# 发送kafka
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
# kafka_result = producer.send("research_center_fourth",
# json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
#
# print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
......@@ -122,21 +123,22 @@ def get_content():
except:
print('请求错误1')
continue
# for data_dict in data_list[::-1]:
for data_dict in data_list[:1]:
article_id = data_dict['article_id']
print(type(article_id))
# is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
# if is_article_id:
# continue
title = data_dict['title'] # 采集到的标题
pub_time = data_dict['input_date']
current_date = datetime.now()
yesterday = current_date - timedelta(days=1)
# 格式化日期
yesterday_date = yesterday.strftime("%Y-%m-%d")
# if pub_time <= yesterday_date:
# continue
title_dict_list = db_storage.find({'title_pd': title.replace(' ', ''), 'is_repeat': ''}) # 如果找到一样的标题 判断三天之内是否有重复的
is_repeat = ''
for title_dict in title_dict_list:
pub_time1 = title_dict['publishDate']
......@@ -152,6 +154,14 @@ def get_content():
doc_href = pq(href_text)
content_html1 = str(doc_href('.d2txt_con.clearfix'))
content_html2 = str(doc_href('.editor.clearfix'))
                # todo: 找到标题并拼接
title1 = doc_href('.d2txt.clearfix h2').text()
title2 = doc_href('.d2txt.clearfix h1').text()
title3 = doc_href('.d2txt.clearfix h3').text()
if title1 == '' and title3 == '':
title_final = title
else:
title_final = title1 + ' ' + title2 + ' ' + title3
except:
print('请求错误2')
continue
......@@ -170,7 +180,8 @@ def get_content():
origin = data_dict['origin_name']
a_dict = {
'id': "1534423014825668610" + article_id,
'title': title_final,
'title_pd': title,
'author': '',
'origin': origin,
'contentWithTag': content_html,
......@@ -183,6 +194,7 @@ def get_content():
}
art_content_dict[article_id] = a_dict
db_a_dict = a_dict.copy()
db_a_dict['title_pd'] = title.replace(' ', '')
db_storage.insert_one(db_a_dict)
if is_repeat == '':
print(href)
......
"""
"""
从es中拿到所有的标题
"""
import redis
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'subjectdatabase'
def queryatt(self,index_name,pnum):
body = {
"query": {
"match": {
"subjectId": "1534423014825668610"
}
},
"sort": [
{
"publishDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.title',
'hits.hits._source.origin',
'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
if __name__ == '__main__':
es_method = EsMethod()
# 连接Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
for i in range(56):
result = es_method.queryatt('subjectdatabase', i*200)
total = result['hits']['total']['value']
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
continue
log.info(f'---第{i}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
title = mms['_source']['title']
origin = mms['_source']['origin']
pub_time = mms['_source']['publishDate']
try:
log.info(f'{id}--{title}--{origin}--')
item = id + "|" + pub_time
# r.lrem(f'XJPdatabase:id_2', 0, item)
r.lpush(f'XJPdatabase:id', item)
except:
continue
"""
"""
对标题进行操作
1.有空格的去掉空格
2.精确去重
3.杰卡德相似度去重
"""
#将数据读到csv中
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_excel('./test2.xlsx')
print(df)
# 去掉空格
df['title_1'] = df['title'].str.replace(' ', '')
print(df['title_1'])
#精确去重
# df_drop = df.drop_duplicates(subset=['title'], keep='first')
# duplicates = df[df.duplicated('title_1', keep=False)]['title_1']
#杰卡德相似度去重
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),max_features=1000)
# tfidf_matrix = vectorizer.fit_transform(df['title'])
#
# dist = 1 - cosine_similarity(tfidf_matrix)
#
# df['similar'] = dist.mean(axis=1)
#
# df_drop = df.drop_duplicates(subset=['title'],keep='last')
# df_drop.to_csv('D:/data/titles_drop.csv',index=False)
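# Step 3 above ("杰卡德相似度去重") only appears as commented-out TF-IDF/cosine code; below is a
# minimal character-set Jaccard sketch (the 0.8 threshold is an illustrative assumption, not a
# value taken from this repo):
def jaccard_sim(a, b):
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

def drop_jaccard_duplicates(titles, threshold=0.8):
    kept = []
    for t in titles:
        # keep a title only if it is not too similar to any title kept so far
        if all(jaccard_sim(t, k) < threshold for k in kept):
            kept.append(t)
    return kept

# e.g. deduped_titles = drop_jaccard_duplicates(df['title_1'].tolist())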
\ No newline at end of file
# -*- coding: utf-8 -*-
......@@ -163,9 +163,8 @@ class BaiduTaskJob(object):
return kwList
def runSpider(self,kwmsg,com_name):
searchkw=com_name + kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
......@@ -186,7 +185,15 @@ class BaiduTaskJob(object):
finally:
baiduSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
import random
def get_comname(self):
# todo:读取redis里的企业名称添加到关键词上
com_name = baseCore.redicPullData('SousuoBaidu:companyname')
if com_name:
return com_name
else:
logger.info('====已无企业===')
return None
def PutWords(codeList, r):
......@@ -208,50 +215,17 @@ if __name__ == '__main__':
baseCore=BaseCore()
logger=baseCore.getLogger()
# ss='(中国机床工具工业协会|中国内燃机工业协会|中国机电工业价格协会|中国机械电子兵器船舶工业档案学会|中国仪器仪表行业协会|中国工程机械工业协会|中国文化办公设备制造行业协会|中国机械工业金属切削刀具技术协会|中国机械工业教育协会|中国汽车工业协会|中国机械通用零部件工业协会|中国环保机械行业协会|中国模具工业协会|中国机械工业勘察设计协会|中国机械制造工艺协会|中国机械工业审计学会|中国轴承工业协会|中国机电一体化技术应用协会|中国机械工程学会|中国液压气动密封件工业协会|中国铸造协会|中国通用机械工业协会|中国锻压协会|中国制冷空调工业协会|中国热处理行业协会|中国电工技术学会|中国仪器仪表学会|中国石油和石油化工设备工业协会|中国表面工程协会|中国食品和包装机械工业协会|中国焊接协会|中国汽车工程学会|中国塑料机械工业协会|中国机械工业企业管理协会|中国印刷及设备器材工业协会|中国机械工业质量管理协会|中国电器工业协会|中国机械工业安全卫生协会|中国重型机械工业协会|中国机械工业标准化技术协会|中国机械工业职工思想政治工作研究会|中国农业机械工业协会|中国机电装备维修与改造技术协会 |机械工业信息研究院|机械工业教育发展中心|机械工业经济管理研究院|机械工业信息中心|机械工业人才开发服务中心|机械工业北京电工技术经济研究所|机械工业技术发展基金会|机械工业哈尔滨焊接技术培训中心|机械工业仪器仪表综合技术经济研究所)+(私收会费|私吞|肆意牟利|损失浪费|索贿|贪财|贪官污吏|贪污|违背组织原则|违法|违纪|为官不廉|为政擅权|窝案|舞弊|泄露国家机密|信鬼神|性关系|虚假信息|虚假招标|隐瞒不报|隐瞒真相|营私|鬻爵|主动投案|资产流失|钻空子|钻漏洞|被调查|被双开|不担当|不老实|不良影响|不正当|不作为|超标准建设|超标准装修|吃空饷|吃拿卡要|渎职|对党不忠诚|非法批地|腐败|腐虫|腐化堕落|公车私用|公费开销|公款吃喝|公款出境|公款旅游|勾结|官迷心窍|好色|回扣|贿赂|挤占挪用|纪律审查|监察调查|监守自盗|践踏法律|接受审查调查|截留克扣|开除党籍|开除公职|抗议|利欲熏心|敛财|乱摊派|乱作为|落马|落网|买官|买卖审批权限|卖官|谋取暴利|谋取私利|目无法纪|幕后交易|弄虚作假|挪用公款|骗取|钱色交易|潜规则|侵害权益|侵吞公款|侵占挪用|圈子文化|权利扭曲|权钱交易|权色交易|山头主义|涉案|生活糜烂|生活奢靡|失察|失管|收送|受贿|双规|双开|私分|私人会所|私设小金库|负面|下降|违规|不利|亏损|上诉|不法|不良名单|停职|公开谴责|公诉|内幕交易|刑事拘留|刑事责任|刑拘|判决|判刑|判赔|司法处置|合同纠纷|处分|处罚|强制执行|仲裁|伪造|伪造公章|投案|投诉|拘留|接受调查|控诉|查封|涉嫌|涉诉监察调查|纠纷|经营异常名录|缉捕|罚单|罚款|罚金|罪犯|自首|获刑|行贿|警示函|贪腐|违约金|追究刑责|造假|逮捕|非法|非法集资判决书|申诉|纠纷|通报|开除|留党察看|追债|逃债|资产负债率|情色交易|搞权钱|曝光|黑料|重罚|虚假报告|侵犯)'
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# codeList = [
# 'KW-20220809-0004',
# 'KW-20220524-0004',
# 'KW-20220809-0005',
# 'KW-20220824-0001',
# 'KW-20220809-0002',
# 'KW-20220809-0003',
# 'KW-20220826-0001',
# 'KW-20220602-0003',
# 'KW-20220602-0002',
# 'KW-20220113-0007',
# 'KW-20220113-0006',
# 'KW-20220108-0004',
# 'KW-20220113-0004'
# ]
# PutWords(codeList, r)
while True:
try:
# codeid = redicPullData("BaiduSearch:WordsCode", r)
# if codeid:
# pass
# else:
# PutWords(codeList, r)
# #codeList.append('KW-20220108-0004')
# logger.info(f'开始采集{codeid}')
com_name = baiduTaskJob.get_comname()
if com_name:
pass
else:
break
codeList = [
# 'KW-20220809-0004',
# 'KW-20220524-0004',
# 'KW-20220809-0005',
# 'KW-20220824-0001',
# 'KW-20220809-0002',
# 'KW-20220809-0003',
'KW-20220826-0001',
# 'KW-20220602-0003',
# 'KW-20220602-0002',
# 'KW-20220113-0007',
# 'KW-20220113-0006',
# 'KW-20220108-0004',
# 'KW-20220113-0004'
'KW-20240206-0001',
'KW-20240206-0002',
'KW-20240206-0003'
]
for codeid in codeList:
try:
......@@ -271,7 +245,7 @@ if __name__ == '__main__':
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
results = [executor.submit(baiduTaskJob.runSpider, data,com_name) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
......
# -*- coding: utf-8 -*-
......@@ -7,6 +7,7 @@ import logbook
import logbook.more
# 核心工具包
import pymysql
import redis
from tqdm import tqdm
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
......@@ -215,6 +216,8 @@ class BaseCore:
except :
pass
def __init__(self):
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
......@@ -288,65 +291,11 @@ class BaseCore:
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
def get_proxy(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
ip_list.append(proxy)
return ip_list
def get_proxyIPPort(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxy = {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
ip_list.append(proxy)
return ip_list
\ No newline at end of file
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = self.r.lpop(key)
return item.decode() if item else None
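    # Usage sketch: redicPullData is the consumer side of the Redis list queues used across these
    # scripts (a producer rpush-es items such as a keyword or 'code|name'); e.g. the Baidu job above
    # pulls company names with baseCore.redicPullData('SousuoBaidu:companyname') and gets None once
    # the list is empty.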
\ No newline at end of file