Commit 6e37a78a  Author: LiuLiYuan

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	shenji/sclx.py
from apscheduler.schedulers.blocking import BlockingScheduler
import pandas as pd
import redis
def putCom():
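# Push '<social_code>|<name>' onto the Redis list UpdateBasdeInfo:SocialCode_CompanyName for every company
# in the Excel sheet that is not in com_list; codes containing 'ZZSN' or 'ZD' are skipped.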
com_list = ['91210000558190456G', '914200001000115161', '911100007109310534', '9111000071093123XX',
'91110000100017643K', '91110000100018267J', '91110000MA01P657XY', '91230100127057741M',
'91440300190346175T', 'ZZSN22083000000003', '91110000400000720M', '911100001055722912',
'91110000100005220B', '911100001000094165', '91310000132200821H', '911100001000128855',
'91110000710924910P', '91110000710924929L', '911100007109225442', '9111000071092649XU',
'91310000MA1FL70B67', '911100007109311097', '912201011239989159', '911100007178306183',
'91310000MA7ALG04XG', '91110000100017707H', '91110000710929498G', '91110000100010249W',
'9151000062160427XG', '91310000MA1FL4B24G', '91110000400001889L', '9144030010001694XX',
'91110000100000825Q', '91110000100006194G', '91110000717828315T', '91110000100001043E',
'91110000MA005UCQ5P', '91110000710935732K', '91110000710930392Y', '91110000710930296M',
'911100007109303176', '91110000710925243K', '91110000100014071Q', '91110000100009563N',
'9111000071093107XN', '9111000010001002XD', '91110000100001852R', '91110000100001625L',
'911100001000080343', '91110000400008060U', '91110000101699383Q', '91110000100000489L',
'9111000071092868XL', '91110000100001035K', '911100004000011410', '91110000710933809D',
'91110000100010310K', '91133100MABRLCFR5Q', '91110000MA001HYK9X', '911100001000016682',
'911100007109279199', '12100000400010275N', '91110000710935636A', '91110000100024800K',
'9144000076384341X8', '91440000100005896P', '91110000MA01W8B394', '91110000717830650E',
'91110000100003057A', 'ZZSN22061600000001', '91310000MA1FL0LX06', '9111000010169286X1',
'91110000100010433L', '91110000100010660R', '91110000102016548J', '91110000100001676W',
'9111000071092200XY', '91133100MA0G9YKT8B', '9111000010000093XR', '91110000100006485K',
'91360702MA7FK4MR44', '91420100MA4L0GG411', '91110000101625149Q', '12100000400006022G',
'912302001285125661', '91110000100005888C', '911100007109250324', '91110000100024915R',
'9111000040000094XW', '91310000MA1FL1MMXL', '91110000100015058K', '91110000710929930X',
'91133100MA0GBL5F38', '9111000010000085X6', '91110000101100414N']
df = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx')
# 连接到Redis数据库
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
for i in range(len(df)):
social_code = df['social_code'][i]
com_name = df['name'][i]
# print(social_code)
if social_code in com_list:
pass
else:
if 'ZZSN' in social_code or 'ZD' in social_code:
continue
else:
item = social_code + '|' + com_name
r.rpush('UpdateBasdeInfo:SocialCode_CompanyName', item)
def putCom_task():
# 实例化一个调度器
scheduler = BlockingScheduler()
# 每个月执行一次
scheduler.add_job(putCom, 'cron', day=1, hour=0, minute=0)
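# the cron trigger with day=1, hour=0, minute=0 fires at 00:00 on the first day of every month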
try:
# redisPushData # 定时开始前执行一次
# putCom()
scheduler.start()
except Exception as e:
print('定时采集异常', e)
pass
if __name__ == '__main__':
putCom_task()
\ No newline at end of file
import pandas as pd
# from pandas import DataFrame as df
import pymysql
import redis
# Connect to MySQL and Redis
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
with cnx.cursor() as cursor:
select = """select relationName, relationId from klb_company"""
cursor.execute(select)
results = cursor.fetchall()
for result in results:
name = result[0]
xydm = result[1]
item = f'{name}|{xydm}'
r.rpush('SousuoBaidu:companyname', item)
# 列表名称
list_name = 'BaseInfoEnterpriseMz:gnqy_socialCode'
# 获取列表中的所有元素
elements = r.lrange(list_name, 0, -1)
# iterate over a snapshot of the list elements
for element in elements:
# LREM with count=0 removes every occurrence of the value and returns how many were removed
count = r.lrem(list_name, 0, element)
# if the value was present, push a single copy back so exactly one instance remains
if count >= 1:
r.rpush(list_name, element)
# 打印处理后的列表
print(r.lrange(list_name, 0, -1))
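# A minimal alternative sketch (assumption: the same Redis connection `r` and the same list_name) that
# deduplicates in one pass by rebuilding the list from an order-preserving dict instead of repeated LREM calls:
#   unique = list(dict.fromkeys(r.lrange(list_name, 0, -1)))
#   pipe = r.pipeline()
#   pipe.delete(list_name)
#   if unique:
#       pipe.rpush(list_name, *unique)
#   pipe.execute()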
import pandas as pd
# from pandas import DataFrame as df
import pymysql
cnx = pymysql.connect(host='114.116.44.11', user='caiji', password='f7s0&7qqtK', db='clb_project', charset='utf8mb4')
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# df_all = pd.read_excel('D:\\企业数据\\数据组提供\\国内企业.xlsx', dtype=str)
with cnx.cursor() as cursor:
select = """select relationName, relationId from klb_company"""
cursor.execute(select)
results = cursor.fetchall()
# print(results)
for result in results:
name = result[0]
xydm = result[1]
with cnx.cursor() as cursor:
update = '''update sys_base_enterprise set name = %s where name is null and social_credit_code = %s'''
cursor.execute(update, (name, xydm))
cnx.commit()
print(f'{name}==={xydm}更新完成')
# list_xydm = ['91110000102017145R','911100001021096991','9111000010285973X7','91110000108283057Y','911100003180821571','91110000322283429E','91110000336431162N','911100005620857121','911100006000036940','911100006000107204','911100006000348885','911100006004827014','911100006337095702','91110000633713369X','911100006337942853','9111000063379674X4','9111000066990444XF','91110000672354637K','91110000700004889C','91110000700006921H','91110000700049024C','91110000700084217T','911100007001499141','9111000070038501XJ','91110000710923360K','91110000710924945A','911100007109255774','911100007177242684','91110000723951109B','91110000726360320G','91110000726362190T','911100007263731643','911100007423131451','91110000754166859U','91110000767525590U','91110000771589298U','911100007776681570','91110000783967006U','91110000802062000J','91110101101100895C','91110101335453570K','91110101355304193A','91110101579007657G','91110101783962889A','911101020592352188','911101021011011341','91110102634381829U','91110102674290067J','91110102685772854R','9111010278170742XX','91110102MA01FFJ36J','91110105051390889B','91110105101756720B','91110105306495333L','91110105306737662D','911101053179472352','91110105318247193G','91110105335500066Q','9111010535131161X2','91110105357967759L','91110105397625067T','91110105400614650L','91110105562128137P','91110105575219505U','911101055790551576','91110105585848161G','911101055938354164','911101055977289680','91110105600015572M','91110105625911031F','911101056336607540','91110105664618436J','91110105669928206J','91110105672840619D','91110105679620184F','911101056876404680','91110105690843864U','91110105726334827M','91110105756025873C','91110105756700197H','9111010576143898XE','91110105764202737L','91110105777670900X','91110105783991313X','911101057877635020','91110105790696320H','91110105801719541B','9111010580171955X3','9111010580177089XM','91110105802095822J','91110105MA002Q6M79','91110105MA003RD50R','91110105MA004C0H06','91110105MA00AGXN3L','91110105MA00FJHN72','91110105MA01AEWR5C','91110105MA01L9PH51','911101060695678147','911101060741434189','91110106101133080K','91110106306572212M','91110106351301243L','91110106567475437Y','911101065768942978','91110106585840012D','91110106587714554K','91110106593832696G','91110106633760720H','91110106633764772R','91110106663111019U','9111010667059416X2','91110106675098771D','911101066932508023','91110106749395454K','911101067533312850','9111010676504112XW','911101067855339571','911101068022066683','91110106MA0056B19T','91110106MA005DBW1G','91110106MA01P1RE3Y','91110107102288949G','91110107587683145R','91110107MA009GQ72T','911101080513793057','91110108061322142F','911101080627636876','91110108062782191G','911101080628016980','911101080649193741','91110108067265302X','91110108074122078Y','911101080765656577','91110108078505359A','91110108078545633M','911101080785732550','91110108089647010H','911101080918560737','91110108093369842B','91110108096441731D','911101080984827059','91110108099067984A','91110108099442801R','9111010810110401X3','911101081011420915','91110108101609659C','91110108101880422A','911101081020223907','9111010810202736X2','91110108102094378J','911101083065093288','91110108306623614J','91110108318056936P','91110108318058456U','9111010831813798XE','91110108327142377N','911101083271749266','911101083272391527','91110108335481926M','91110108335562435H','911101083363962058','911101083398292057','91110108344290793F','91110108344314759F','911101083443180558','911101083443783306','91110108344403743F','91110108355313321X','91110108400
001643B','91110108551427625G','91110108554837179A','91110108554890762H','91110108560358422K','91110108560385447N','91110108560432856H','91110108562135265P','91110108563622495U','91110108565780884D','9111010856749593XU','911101085694855326','911101085694925139','91110108569524423F','911101085712035817','9111010857128414X8','911101085712845102','91110108576914390R','91110108576914817K','911101085790313156','911101085844819439','91110108585861972A','9111010858587583XQ','91110108587665983J','911101085890746187','91110108590662476F','911101085923662400','9111010859963405XW','91110108599644434U','91110108599663854W','911101086000694820','911101086003726929','91110108600404359L','91110108633708906M','91110108660513776K','91110108661550528Q','911101086615579497','91110108662151975E','91110108662170324C','911101086621777295','911101086631036849','911101086631154075','91110108663124944W','91110108663136638D','91110108664619674E','911101086656289355','91110108666258040N','911101086684483666','91110108671727577D','91110108672826657J','911101086738170589','9111010867662354XX','911101086766404898','91110108679604408D','91110108679611421U','911101086796241695','911101086804563776','91110108682894987G','911101086835621402','91110108686919328W','911101086883662373','91110108690011590J','91110108693213091F','911101086950387332','9111010869504894XN','91110108696323261L','91110108699627252X','91110108700235062K','91110108718777804Y','91110108718785556J','91110108722617934K','9111010872357215XK','91110108723952478G','911101087263410239','91110108733464566A','911101087355893625','9111010873559070X0','91110108735591489G','91110108737656338N','911101087376669155','91110108740421820F','91110108746113570P','91110108746729965F','91110108752161931Y','91110108753327825C','9111010875333972X7','9111010875467591XH','91110108754681201W','91110108758242935T','911101087582455976','911101087601419302','9111010876142254XU','91110108762181186P','911101087629781362','911101087635308194','91110108766287121Q','911101087684682847','91110108768471723F','91110108769354705D','91110108769356188B','91110108769900489W','911101087704233332','91110108770425654N','91110108771981556U','91110108771986242H','91110108773361465H','91110108773369432Y','911101087740615606','91110108774714285P','91110108775491714G','91110108776352708P','91110108777650264L','911101087776681301','911101087795289672','9111010878020592XF','91110108780217285R','91110108780238166U','91110108780955384Y','91110108780964686N','91110108781703664R','91110108782543551R','91110108783218849X','911101087839528242','911101087861701904','91110108790650445R','911101087921006070','911101087934019542','91110108794082078E','911101087951375794','91110108797552733T','91110108798525948B','911101087990254941','911101088011707638','91110108802021110U','911101088020333577','91110108802041787A','91110108802045657E','91110108802068007C','911101088020726207','91110108802109673L','91110108MA001N718J','91110108MA0021P69M','91110108MA002XL790','91110108MA003LNY5D','91110108MA003TAB64','91110108MA003YWP4D','91110108MA0043KP9E','91110108MA004F704R','91110108MA004LW69T','91110108MA004RAE05','91110108MA0068GY1F','91110108MA006K8Y3P','91110108MA0071CR55','91110108MA007H3P5K','91110108MA0086HR6G','91110108MA008DA429','91110108MA008HB66A','91110108MA008P9657','91110108MA008PK575','91110108MA0092QT4X','91110108MA00AGM13W','91110108MA00AU927M','91110108MA00DCJ01Y','91110108MA00DE1B2B','91110108MA00FA7E5C','91110108MA00GUD41A','91110108MA0188DW84','91110108MA018J4L08','91110108MA018MCC6M','91110108MA01BBB16
K','91110108MA01BP1P7B','91110108MA01C8JR79','91110108MA01DMU77F','91110108MA01DNC75B','91110108MA01EGPQX2','91110108MA01RCWH0M','91110108MA01RWUG4Y','91110108MA01WQE10K','91110109330285061E','91110109567452606A','91110109590674493W','9111011159606037XJ','911101116812383633','91110111700001063P','91110111MA003JG31Y','91110111MA01L2H65N','91110112551358631R','91110112700216160K','911101127177330338','911101127400501696','911101127415832828','9111011276218407XN','91110112766758720D','911101127889851669','9111011279904576XL','91110113080516727E','91110113306541555R','91110113576855941L','911101136812208172','91110113696302276M','91110113741581703F','91110113752642938G','91110113756000350K','911101137577358263','91110113762992739Q','91110113MA001GWR0M','9111011408549335X2','9111011455135477XA','91110114582515556F','91110114589114325P','91110114590663348R','91110114600067778R','911101146796092682','911101146804798353','91110114682851688K','91110114685107782U','911101146900106275','911101147226688971','911101147426127944','91110114744716255J','9111011475010452XE','91110114750144214X','911101147667528632','911101147770556682','91110114MA001D4X3K','91110114MA01AEDF61','911101151016193470','911101151029162045','91110115576904205N','91110115733451490U','91110115736468984G','91110115746112690C','91110115MA0048EL1E','91110115MA017K5L4X','91110116064905925Y','911101165996396434','91110116767502874D','91110116MA005B3L58','91110116MA01C0AY5K','91110117330386452K','911102283272479535','911102283512805187','91110228582505681F','9111022867876096X3','911102287177842959','91110228754175237Y','91110228MA006GMF6R','9111030205136463XD','91110302053604529E','91110302057391444C','9111030210221806X9','911103021022784175','91110302306784047Q','911103025604366893','91110302565797010A','91110302565820110R','911103026003405002','91110302677444199R','911103026787533566','911103026857985287','91110302735090430Y','911103027493533932','911103027493534308','9111030276350109XG','9111030278250283XW','91110302801786752A','91110302MA0048YP1U','91110302MA005FFW29','91110302MA0066E64R','91110302MA008RUM5Y','91110302MA00AR3F76','91110302MA00B9G54G','91110302MA00B9MQ4G','91110302MA00BJ6B78','91110302MA00G8EH41','91110302MA00GQGB73','91110302MA00GRMLX4','91110302MA01AAXW1T','91110302MA01HEH15A','91110400MA029M4P80','91120000058736889L','91120000103069967Y','91120000103870914U','91120000741366579H','91120000761253280R','911200007676306733','91120000MA06F32U06','911201046630720486','91120104789385824Y','911201048034181441','91120110083028075A','91120110300659413H','91120111103789059M','911201116847488286','91120111697419046H','91120111722991870E','91120111741361313C','911201117925370324','91120112064042488E','91120112093771153W','911201127803488406','91120112MA05WM7M02','91120113079635948K','91120113660321205C','911201137303863474','91120113783335092P','911201160587336021','911201160612051730','91120116086586515N','91120116103481433E','91120116239661863L','91120116239663439U','91120116300452033U','911201163409833307','9112011655651308XJ','91120116562678278A','91120116586419887T','91120116592916759Q','91120116600910892X','911201166630834172','91120116671457175N','91120116675967105W','91120116697408240K','91120116712934952M','91120116718278597H','911201167244641345','91120116730357968N','911201167328190464','91120116735474530F','91120116746652267N','911201167491124502','91120116758137027D','91120116764348197P','911201167803339648','911201167833047124','91120116794980409G','911201167972829995','91120116MA05PQB5XT','91120116MA069EXE4T','91
120116MA06DRM4XY','91120116MA0705BL96','911201186670532667','91120118735488182M','91120118762158867F','91120118MA05JQUK0G','91120118MA05QFTE3C','91120118MA05T81X8A','91120118MA0697LP9T','91120118MA06T62187','911202216877459052','91120222566105610W','91120222575108434H','911202227706300842','91120222MA05KHKY2P','91120222MA05UAG55H','91120223600894351U','91120223761280668D','91120224300621490X','911202245661215811','91120224681877747F','91120224700557176T','91120224MA07871882','9113000023565800XC','91130000752446136W','911301001044060055','91130100107744755W','91130100601090291K','91130100689298985P','91130100732910720N','91130100732914772Y','91130100745411306F','911301007468556979','91130100754027891A','91130100776179546U','91130100787019708G','91130101678512755X','91130101789833818T','91130104784084838J','911301257898318475','91130132MA0A7AYE2H','91130181791385313K','91130181791386236G','91130182685711699G','91130182791357005D','91130183575506723L','911301836870224839','911301837713256634','911301847216647980','911301856652827511','91130185669060689W','911301857233544863','9113018576519998X9','91130193074894510E','911301931078905417','911302005661986189','911302006799397935','9113020068276818X4','91130204347873513P','91130224666556267M','91130225MA07U3734B','91130229721600380L','91130281MA07KE3A17','91130282554499915A','91130282750290545E','91130282MA07P2E981','91130283052683448M','91130283601019508G','91130293096112137N','911302936746855014','91130293774420041F','91130293796568127H','91130294308381129A','91130300601108025E','91130301329656355R','91130301601147147J','91130301678536714D','9113030168136727XL','91130301MA08XEAB6Q','911303035673840924','91130303673240113T','91130392601151496U','91130400730275049G','91130400MA08CCBX29','91130405748493781M','91130407MA0CUE7R5R','91130408MA08XFQJ61','911304246843413669','91130424757510432X','91130426699207653P','91130429679913817F','91130434MA07R66J0A','91130435564863776A','911304816690569897','91130481721643479D','91130500769806003D','91130501693478268W','91130525721609633G','911305287343786273','91130528737368715C','91130528743430458K','91130532723397101T','91130582095633598M','911306007006711044','911306055728149239','91130605596834603J','91130605601201668M','911306056746516436','91130606550419199D','91130637752422695J','9113070556195375XQ','91130705769821035L','911307317870236272','91130800757548430L','91130802728832010D','91130803MA0CMRHN8L','91130824771327626D','9113090010971869X1','91130900670338967F','91130900700660368J','91130900765171063F','911309007681306540','91130900779198582P','91130900791398851A','911309033200553935','91130922687004365L','91130923732923871M','9113092510971914XX','91130927789824567A','911309313081379192','91130931329627183Q','91130931557675726N','91130931MA07MEM874','91130981596828756D','91130981763428435T','911309825809745213','911309826760246784','911309827401835863','91130983MA07N7T53G','91130984567358986H','911310001057748114','9113100060134890XT','911310007634343680','91131022336194910M','9113102258690708X9','91131025731429118G','9113102630827362XB','91131028MA07KQYW1M','91131081335912618X','9113108267322544XE','91131082755457551W','91131100109804512G','91131100700865494B','91131100746851872J','91131101236298229N','911311017913820594','91131101MA08EWC63Q','911311220826746736','91131125780842443M','91131127560486483D','911311816882161913','91131181700712973X','91131182093289869R','91131182109874836Y','91131182779189192D','91140000056278968H','91140000160963703Y','91140000330566883Q','911400005973987278','91140000
701000732H','91140000715931861P','9114000074855218XX','91140100110047117B','91140100556560310M','91140100568462347Q','91140100578457859T','91140100586171535D','91140100713634804H','91140100713674988T','91140100715946502L','91140100754093899G','91140100792241864R','91140108731935643H','91140121757294792H','91140200770127753X','91140300694291892Q','91140300748578443L','91140311666645518N','91140322110721968E','911404001107700495','911405003257661198','91140500586185996N','91140522MA0JRG8Q99','911406006024604424','91140700065564755Y','91140700719819164X','91140800556559520F','91140800733994655W','911408007540500477','911408227011988570','9114082370110438XN','91140900798276152L','91140930794219089L','91140932729686916F','911500001141618816','91150000733284733B','91150100573268485R','911502047971536367','91150291594612345H','91150291701423911F','91150291787086089U','91150800701444800H','911525007116525588','91210000118887313L','91210000242666665H','91210000686609602P','91210000738792171J','912101000016232858','91210100117812926M','91210100243490227Y','912101006046149869','912101006625215774','91210100769563590L','91210100798474220Q','912101047845707057','91210105057192314D','91210105564689755B','91210105798464057N','912101063132548617','912101066671654449','912101066874643611','912101067386643481','91210106755504303X','91210106760090619H','91210106769599542Y','91210112675348347K','9121011271579529XH','91210112738671871J','91210112769598654A','91210112MA0P432U8R','91210112MA0P44NRXH','91210113088956102C','91210113578366586N','912101137555387734','91210113760060444N','91210114340680807E','91210181MA0XQF19XH','9121020011831278X6','91210200118561313C','91210200241297917U','912102006048648626','91210200677529168F','91210200716992578X','91210200723495318L','912102007409045158','91210200744362020N','91210200751579797A','91210200MA0TR2P80G','91210204MA0QDTY23G','91210211MA0Y19KN3J','91210212732749973K','91210213604838795D','912102137327794199','912102137920497177','91210231736407196M','91210231756073509F','91210242118382526E','91210242728848952B','91210244559824828B','91210283696011524C','91210300MA0TT2DH9R','91210381241525115T','91210381567557686J','91210381603655081B','9121060008113718XY','91210600120109772C','91210600242814525N','912106005909453539','91210682781643139U','912107000721599341','912107002420322837','91210700577233300Y','91210700736737822M','91210700749779175E','91210700768337030B','912108006768912029','912108006926672350','91210900584194995N','912109007016848390','91210921MA0XX7NM2T','9121100059093999XJ','91211000726845918Y','9121100474278967XK','91211021590945396H','91211103MA102UN249','91211200561382299U','912112006737775195','9121122106407122X3','91211300791572581J','9121140055815624XQ','91211400747119974B','91220000664275090B','9122000070222720XH','91220000786819498L','91220101050518975F','91220101081849654U','912201012449758167','91220101310012867G','91220101697761845P','91220101727117306C','91220101730777372U','912201017484274776','912201017561541220','912201017671658636','912201017671930129','912201017710567829','91220101794404583W','91220101794442483P','91220101MA14TY564H','91220104675648489W','912202011239483018','912202016051690282','91220201682611844F','91220201786812798U','91220300565092475E','91220422MA0Y3F777J','91220501126870028U','91220501244575134M','91220501244583871H','91220501723101462L','91220501791105350N','91220521MA173E261W','91220523660141001R','9122082155977797XR','912224037742347248','912301001275921118','91230100607168790X','91230108127420096N','91230108MA1B0J
HY73','912301107631541551','912301993011658539','91230199301195470J','91230199301211856H','9123019958512805X3','91230199696825683P','91230199775036754Q','91230199MA18XA396K','91230300130721906W','91230600560617893B','91230600MA1BF4U29A','912306056926467095','912312815838229881','91233001569893325G','91310000051240362X','9131000005124956XX','9131000005304658XH','91310000057656705A','913100000625940784','91310000076492259A','91310000080013687R','913100001321244277','913100001321644452','913100001322131129','91310000132231361P','91310000132653687M','91310000132660318J','9131000013297865X2','9131000013413459XC','91310000300253536H','91310000301354857P','91310000312519282U','91310000324299264L','913100003244893596','91310000342056098N','913100003508461023','91310000351008055W','913100005515491712','91310000552962929G','91310000557430243L','913100005665114915','91310000568072146K','913100005708082124','91310000572698184Q','91310000582138631D','91310000582139781F','913100005867988561','91310000590384058P','91310000590397350D','913100006072612077','91310000607272280Q','91310000607286404W','913100006072944121','9131000060729499X9','91310000607311067X','91310000607339123C','913100006073622866','91310000607370331G','913100006073785958','91310000607403041J','91310000607404087G','91310000607422576R','913100006074261470','91310000607431720X','9131000060751688XT','913100006075916282','91310000607601064L','9131000063021103X7','91310000630453442X','91310000630483465G','91310000630948912G','91310000630965915K','91310000631137409B','91310000631191552K','913100006314149553','913100006314627462','91310000631521822M','91310000631534594F','913100006317557680','9131000066240918XU','91310000667780236Q','91310000669359189D','91310000669363292T','91310000669421384T','913100006711091037','91310000674575425N','913100006746031318','91310000680976508E','91310000680999558Q','91310000682254509X','91310000682263886E','913100006873885738','91310000690125272H','91310000692998798F','9131000069420172XB','9131000069578172XC','91310000695810746C','91310000697295223K','913100006988365624','913100007030116706','913100007030557379','913100007030973396','91310000703147746G','91310000703340159B','913100007294735903','91310000729493479N','91310000733344636F','91310000733365971U','91310000734057153P','91310000734081815D','91310000735408592G','913100007381411253','91310000738505304H','913100007385256042','913100007421053624','91310000747273971D','91310000748756174J','913100007487913409','91310000751468181F','91310000751863771N','913100007518999777','91310000756110429R','9131000075842961XY','9131000075855850XT','91310000759040681R','91310000761199691M','913100007653010244','91310000765583375Y','91310000765596096G','91310000768354199F','913100007694197083','91310000770201458T','913100007714584745','91310000772115131G','91310000772864810L','913100007728924912','91310000773282177G','913100007743059833','91310000774323671U','9131000077478390X5','91310000775216587B','91310000775238065L','913100007757838991','91310000778930516R','913100007824379352','91310000784298270U','91310000784783241W','91310000787230976G','91310000787878254Q','91310000792703993P','91310000792783700P','91310000797050338W','91310000MA1FL74J78','91310000MA1G8BHPXX','91310000MA1H38T58K','91310000MA1H3GDC5H','91310000MA1J37FN5Q','91310000MA1K2Q6J2X','91310000MA1K35P57Y','91310000MA7CJ9P40C','913101040693974723','91310104301579458U','91310104342172646H','913101043423482187','9131010455298989X1','913101045758452582','9131010458529260X8','91310104669392966T','91310104674626798A',
'913101046855187256','91310104692921256Y','91310104742657562G','91310104776270040D','91310104MA1FR0P33B','91310104MA1FR9PL54','91310105074824416U','91310105090037252C','91310105312284129D','91310105779753697E','91310106066020397Q','91310106550090004W','91310106630236093C','91310106MA1FY9LT3N','91310107051295590B','913101076076323035','91310107781531233F','91310109312143131N','91310110054590464F','91310110078155571L','913101100861724784','91310110342313605X','913101103507613521','91310110351027504X','91310110351154941K','91310110591673062R','91310110757926286X','91310110787862412B','9131011205506145X2','913101121326732580','91310112301708379M','91310112350881637E','91310112350889276J','91310112351114237B','91310112557480662J','913101125868251134','91310112607425988Y','91310112607671054B','91310112630792962D','91310112767225977D','913101127785041388','913101127851867808','91310112789576698P','91310112MA1GB5HL74','91310112MA1GB63D5E','91310112MA1GBCU74Q','91310112MA1GBEPY9P','91310112MA1GBWLUXN','91310112MA1GC28U0L','91310112MA1GC78A07','91310112MA1GCHQP57','91310113086201072B','913101131345344112','91310113342290888U','91310113550058717X','91310113631482720W','91310113754764752Y','913101140693041410','91310114074811922G','91310114320742767K','9131011434217342X2','9131011455159938XA','913101145529068046','913101145619308064','91310114570796872F','91310114577469866W','9131011458207544XY','91310114588740092M','91310114591692730A','91310114630211689Y','913101146305896733','913101146308058904','913101146315357223','913101146319344919','91310114695793034W','913101147030104249','91310114754758651R','913101147557198576','91310114760573215T','913101147728614257','91310114781898318F','91310114MA1GT4926T','91310114MA1GURM19N','91310114MA1GW61HX2','91310114MA1GWJL62M','91310115051251125K','913101150512565326','913101150608727672','913101150609007219','91310115080028627C','9131011508201988XX','91310115084100518T','913101150938266958','913101151321295193','913101151339870722','913101153124932461','9131011532075221XC','91310115324253960J','91310115324284513E','9131011533262045X2','91310115332642560W','913101153986795507','91310115555949711X','91310115568057640Y','91310115569630816D','91310115570750452T','91310115572703801L','91310115599770596C','913101156822157531','91310115690170444F','913101156972022424','91310115703497359F','91310115767236430H','913101157732980993','91310115779776581R','91310115792736664G','91310115795654795D','91310115MA1H70PK5R','91310115MA1H727E7E','91310115MA1H7RLE45','91310115MA1H7W8439','91310115MA1H7W8514','91310115MA1H9HD02E','91310115MA1H9K3FX7','91310115MA1HATB40R','91310115MA1K39C71R','91310115MA1K3B1R09','91310115MA1K3BQK2U','91310115MA1K3CM30B','91310115MA1K3F6C05','91310115MA1K3K2N9H','91310115MA1K3KJW0N','91310115MA1K3MP458','91310115MA1K41R2X3','91310115MA1K493TXQ','91310115MA1K4CLB55','91310115MA1K4MF39X','913101160637712405','913101161321521531','91310116555985835J','913101165601545691','91310116563135240C','91310116566563515F','913101166607195719','91310116662495241T','91310116671156516L','91310116676273009U','91310116687330646Y','91310116759882926H','913101175559503333','9131011756018678XG','91310117574182309H','91310117598194355D','913101176311798956','91310117632167028T','91310117662458598U','91310117662473499L','91310117662485385P','91310117669377619D','913200001347587142','91320000583783720B','913200006082630012','91320000710929340E','91320000743141824Y','91320000751254554N','91320100093975981A','91320100134974572K','91320100135847161T','913201025628951334','91320
191134955910F','91320191726079387X','913201921349556628','913201922497944756','91320200135890776N','91320200135914870B','913204002508323014','91320500741304044W','91320509138285715E','91320509796141166A','91320582134789270G','913207007322513070','91320982571427139M','91321291703974741U','913300001429120051','9133000014293866XE','91330000142941287T','91330000710924531U','91330100253930310D','91330110MA2CGBC056','913302001440685655','91330200704800698F','91340000148941616G','91370112MABYCTU036']
# for xydm in list_xydm:
# for num_df in range(len(df_all)):
# social_code = str(df_all['social_code'][num_df])
# if social_code == xydm:
# com_name = str(df_all['name'][num_df])
# with cnx.cursor() as cursor:
# update = '''update sys_base_enterprise set name = %s where social_credit_code = %s'''
# cursor.execute(update, (com_name, xydm))
# cnx.commit()
# print(f'{xydm}===更新成功{com_name}')
# break
\ No newline at end of file
......@@ -45,6 +45,9 @@ def get_html(tycid, s, headers):
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
if div_part is None:
return -2
else:
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
......@@ -64,7 +67,10 @@ def get_page(url, s, headers):
if res.status_code != 200:
raise
data_page = res.json()
try:
total_page_ = data_page['data']['total']
except:
raise
return total_page_
......@@ -77,11 +83,12 @@ def doJob():
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie = token.get_cookies()
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
......@@ -90,7 +97,7 @@ def doJob():
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
# social_code = baseCore.redicPullData('CorPersonEnterprise:gnqy_socialCode')
# 判断 如果Redis中已经没有数据,则等待
social_code = '911101067916069050'
social_code = '91110108780992804C'
if social_code == None:
time.sleep(20)
continue
......@@ -163,6 +170,11 @@ def doJob():
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
......@@ -240,6 +252,8 @@ def doJob():
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
if flag == 1:
for one_info in list_all:
name = one_info['name']
......
"""
天眼查人员信息
问题1:页面和接口数据不一致 目前方法 单独处理
问题2:页面人员总数拿的不够准确 目前方法 修改获取父标签逻辑 已解决
"""
import datetime
import json
import requests, time
from bs4 import BeautifulSoup
import urllib3
from retry import retry
from base.BaseCore import BaseCore
from getTycId import getTycIdByXYDM
baseCore = BaseCore()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
log = baseCore.getLogger()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
list_all_1 = []
list_all_2 = []
taskType = '天眼查/核心人员更新'
from lxml import etree
from classtool import Token, File, Tag
token = Token()
@retry(tries=3, delay=1)
def get_html(tycid, s, headers):
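# Open the Tianyancha company page and locate the staff section (div with data-dim='staff').
# Returns the head count shown after '最新公示' when that tab exists, -1 when the section lookup fails
# or the tab text lacks '最新公示', -2 when the page has no staff section, and 0 when the count cannot be parsed.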
url = f"https://www.tianyancha.com/company/{tycid}"
# ip = baseCore.get_proxy()
response = s.get(url=url, headers=headers)
if response.status_code == 200:
pass
else:
raise
# return -1
soup = BeautifulSoup(response.content, 'html.parser')
try:
div_part = soup.find('div', attrs={'data-dim': 'staff'})
# div_part.find('div', class_='dimHeader_root__XTCLe')
except:
return -1
if div_part is None:
return -2
else:
try:
tmp_field = div_part.find('div', class_='dim-tab-root').find('span').text
if '最新公示' in tmp_field:
total = div_part.find('div', class_='dim-tab-root').find('span').get_text().split('最新公示')[1].replace(' ', '')
return int(total)
else:
return -1
except:
return 0
@retry(tries=3, delay=1)
def get_page(url, s, headers):
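# Request one page of the staff/executive API through a proxy and return data['data']['total'];
# a non-200 response or a missing field raises so the @retry decorator can try again (3 attempts).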
ip = baseCore.get_proxy()
res = s.get(url=url, headers=headers, proxies=ip)
time.sleep(1)
if res.status_code != 200:
raise
data_page = res.json()
try:
total_page_ = data_page['data']['total']
except:
raise
return total_page_
def doJob():
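# Main loop: pull one '<social_code>|<company_name>' item from the Redis queue UpdateCoreperson:SocialCode_CompanyName,
# look up (or insert) the enterprise record, resolve its Tianyancha id, scrape the core-personnel list via the page
# and the matching API, build list_one_info, and push the item back into Redis on failure.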
# for social_code in social_code_list:
while True:
# todo:设置cookies的使用
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
# 'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'version': 'TYC-Web'
}
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
cookies = {}
for cookie in cookies_list:
cookies[cookie['name']] = cookie['value']
s = requests.Session()
s.cookies.update(cookies)
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
item = baseCore.redicPullData('UpdateCoreperson:SocialCode_CompanyName')
# 判断 如果Redis中已经没有数据,则等待
# social_code = '91110108780992804C'
if item == None:
time.sleep(20)
continue
start = time.time()
social_code = item.split('|')[0]
try:
data = baseCore.getInfomation(social_code)
if len(data) != 0:
id = data[0]
com_name = data[1]
xydm = data[2]
tycid = data[11]
count = data[17]
else:
# 数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
#数据库中并没有该企业 需要新增
pass
id = data[0]
com_name = data[3]
xydm = data[1]
count = 0
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name, xydm))
cnx_.commit()
tycid = ''
if tycid == None or tycid == '':
try:
retData = getTycIdByXYDM(com_name, s)
if retData['state']:
tycid = retData['tycData']['id']
# # todo:写入数据库
updateSql = f"update EnterpriseInfo set TYCID = '{tycid}' where SocialCode = '{xydm}'"
cursor_.execute(updateSql)
cnx_.commit()
else:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
log.info(f'======={social_code}====重新放入redis====')
baseCore.rePutIntoR('UpdateCoreperson:Error', item)
continue
except:
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', '获取天眼查id失败')
baseCore.rePutIntoR('UpdateCoreperson:Error', item)
continue
count = data[17]
log.info(f"{id}---{xydm}----{tycid}----开始采集核心人员")
list_one_info = []
num = 1
try:
charge = get_html(tycid, s, headers)
# 页面请求三次都失败
except:
charge = -1
t = int(time.time() * 1000)
if charge == -1:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f"{id}---{xydm}----{tycid}----请求失败----重新放入redis")
time.sleep(2)
continue
elif charge == -2:
# 该企业没有人员信息
log.info(f"{id}---{xydm}----{tycid}----没有核心人员")
continue
elif charge == 0:
log.info(f"{id}---{xydm}----{tycid}----没有最新公示")
url1 = f'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page1 = get_page(url1, s, headers)
except:
total_page1 = 0
url = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page1
flag = 2
else:
log.info(f"{id}---{xydm}----{tycid}----有最新公示")
url2 = f'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
url3 = f'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={t}&gid={tycid}&pageSize=20&pageNum=1'
try:
total_page2 = get_page(url2, s, headers)
except:
total_page2 = 0
time.sleep(1)
try:
total_page3 = get_page(url3, s, headers)
except:
total_page3 = 0
if total_page2 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/noRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page2
flag = 1
else:
if total_page3 == charge:
url = 'https://capi.tianyancha.com/cloud-listed-company/listed/getHkNoRepeatSeniorExecutive?_={}&gid={}&pageSize=20&pageNum={}'
total_page = total_page3
flag = 3
else:
total_page = 0
flag = 0
baseCore.rePutIntoR('UpdateCoreperson:Map', item)
log.info(f'{id}---{xydm}----{tycid}----页面和接口数据不对应---{charge}---{total_page2}---{total_page3}')
continue
if total_page == 0:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f'==={social_code}=====总数请求失败===重新放入redis====')
continue
# # todo:获取页数
# total_page = 34
# flag = 2
# todo: 测试程序是否执行到这一步
log.info(f'总数为{total_page}')
for page in range(1, int((total_page / 20) + 1) + 1):
res = None
for c in range(3):
ip = baseCore.get_proxy()
url_ = url.format(t, tycid, page)
# url_ = 'https://capi.tianyancha.com/cloud-company-background/company/dim/staff?_=1706765329671&gid=8715844&pageSize=20&pageNum=1'
res = requests.get(url_, headers=headers, proxies=ip, verify=False) # ,verify=False
time.sleep(1)
if res.status_code == 200:
break
else:
if c == 2:
break
continue
if res:
pass
else:
token.updateTokeen(id_cookie, 2)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
log.info(f'{id}---{xydm}----{tycid}----高管信息请求失败')
continue
# todo:test测试
log.info(f'{id}---{xydm}----{tycid}----{res.json()}')
try:
list_all = res.json()['data']['dataList']
except:
list_all = res.json()['data']['result']
if list_all:
pass
else:
log.info(f'{id}---{xydm}----{tycid}----没有高管信息')
# todo: 关闭连接
res.close()
if flag == 1:
for one_info in list_all:
name = one_info['name']
sex = one_info['sex']
education = one_info['education']
position = one_info['position']
Salary = one_info['salary']
# todo:获取当前年份
now = datetime.datetime.now()
year = now.year
try:
birthYear = year - int(one_info['age'])
except:
birthYear = ''
StockKeepings = one_info['numberOfShares']
currentTerm = one_info['term']
personInfo = one_info['resume']
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": StockKeepings,
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm,
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# list_all_2.append(dic_json_img)
elif flag == 3:
for one_info in list_all:
name = one_info['personal_name']
try:
sex = one_info['gender2']
except:
sex = ''
education = ''
position = one_info['position_name']
Salary = ''
try:
birthYear = one_info['year_of_birth']
except:
birthYear = ''
personInfo = one_info['resume_cn']
try:
timestamp = int(one_info['employ_date']) / 1000
currentTerm = time.strftime("%Y-%m-%d", time.localtime(timestamp))
except:
currentTerm = ''
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": sex,
"education": education,
"position": position,
"salary": Salary,
"birthYear": birthYear,
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": currentTerm + '至-',
"personInfo": personInfo,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
else:
for one_info in list_all:
name = one_info['name']
try:
position = one_info['typeSore']
except:
position = ''
person_id = one_info['id']
person_url = f'https://www.tianyancha.com/human/{person_id}-c{tycid}'
# person_res = requests.get(person_url, headers=headers, proxies=ip)
person_res = requests.get(person_url, headers=headers)
person_soup = BeautifulSoup(person_res.content, 'html.parser')
try:
personInfo = person_soup.find('span', {'class': '_56d0a'}).text.strip()
except:
personInfo = ''
try:
person_img = one_info['logo']
except:
person_img = '--'
dic_json = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"sort": str(num)
}
dic_json_img = {
"socialCreditCode": social_code,
"name": name,
"sex": '',
"education": '',
"position": position,
"salary": '',
"birthYear": '',
"shareNum": '',
"shareRatio": '',
"benefitShare": '',
"currentTerm": '',
"personInfo": personInfo,
"头像": person_img,
"sort": str(num)
}
num = num + 1
list_one_info.append(dic_json)
# print(list_one_info)
json_updata = json.dumps(list_one_info)
if json_updata == '[]':
continue
else:
pass
# response = requests.post('http://114.115.236.206:8088/sync/executive', data=json_updata, timeout=300,
# verify=False)
# print(response.text)
log.info('=========成功======')
token.updateTokeen(id_cookie, 3)
time.sleep(10)
except Exception as e:
log.info(f'==={social_code}=====企业核心人员采集失败===重新放入redis====')
log.info(e)
# 重新塞入redis
baseCore.rePutIntoR('UpdateCoreperson:SocialCode_CompanyName', item)
state = 0
takeTime = baseCore.getTimeCost(start, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', f'获取企业信息失败--{e}')
time.sleep(5)
break
# df_img = pd.DataFrame(list_all_2)
# df_img.to_excel('企业主要人员-头像.xlsx',index=False)
if __name__ == "__main__":
doJob()
\ No newline at end of file
# -*- coding: utf-8 -*-
import datetime
import json
import re
import time
......@@ -13,7 +14,7 @@ from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from dateutil.relativedelta import relativedelta
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
......@@ -56,6 +57,12 @@ def sendkafka(post_data):
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
def Lreputredis(company_field):
# todo: 重新放入redis
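# remove the 'end' sentinel, append the company, then re-append 'end' so the sentinel stays at the tail of the queue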
baseCore.r.lrem('BaseInfoEnterprise:gnqy_socialCode', 0, 'end')
baseCore.r.rpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
baseCore.r.rpush('BaseInfoEnterprise:gnqy_socialCode', 'end')
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
# 取出两个字典的key值集合
......@@ -320,17 +327,18 @@ def dic_handle(result_dic):
}
return aa_dict
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
# ip = baseCore.get_proxy()
# req = requests.get(headers=headers, url=url, proxies=ip)
req = s.get(headers=headers, url=url)
time.sleep(1)
soup = BeautifulSoup(req.content, 'html.parser')
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
......@@ -350,7 +358,8 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
......@@ -359,18 +368,23 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
if searchinfo == '0':
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
except:
log.info("登录失效===重新放入redis")
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
else:
# 开始采集
try:
......@@ -383,16 +397,17 @@ def redaytowork(com_name,social_code,securitiesCode, securitiesShortName, listin
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
......@@ -408,15 +423,84 @@ def ifbeforename(company_url):
else:
return ''
#解析时间
def paserTime(publishtime):
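# Convert Tianyancha's relative timestamps ('3天前', '昨天', '2小时前', ...) into datetime objects anchored to now;
# absolute dates such as '2023年05月01日' are parsed directly. If parsing fails, the input string is returned unchanged.
# Example: paserTime('2天前') gives a datetime two days before the current time.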
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
# delta = datetime.timedelta(months=day)
publishtime = current_datetime - relativedelta(months=day)
# publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
if '小时' in publishtime:
hour = publishtime.split("小时")[0]
else:
hour = 0
if hour != 0:
min = publishtime.split("小时")[1].split("分钟")[0]
else:
min = publishtime.split("分钟")[0]
delta = datetime.timedelta(hours=int(hour), minutes=int(min))
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
# 采集基本信息和工商信息
def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, category, exchange, listType, ynDomestic, countryName, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
req_ = s.get(headers=headers, url=company_url)
com_soup = BeautifulSoup(req_.content, 'html.parser')
driver.get(company_url)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
sourceUpdateTime = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
try:
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
log.info(f'天眼查无该企业{social_code}')
return
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
......@@ -502,50 +586,55 @@ def spiderinfo(company_url, securitiesCode, securitiesShortName, listingDate, ca
print(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
else:
data_baseinfo = baseinfo(com_soup)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
data_baseinfo['统一社会信用代码']
except:
log.info('未获取到统一社会信用代码')
if social_code:
data_baseinfo['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
if data_baseinfo['企业名称'] == '-' and com_name:
data_baseinfo['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# 采集成功的企业
data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(data_baseinfo)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
aa_dic['ynDomestic'] = ynDomestic
aa_dic['countryName'] = countryName
aa_dic['securitiesCode'] = securitiesCode
aa_dic['securitiesShortName'] = securitiesShortName
aa_dic['listingDate'] = listingDate
aa_dic['category'] = category
aa_dic['exchange'] = exchange
aa_dic['listingType'] = listType
# sendkafka(aa_dic)
print(aa_dic)
post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
dic_info = json.dumps(aa_dic)
req = requests.post(post_url, data=dic_info)
# todo: 重新放入redis 删除end再放入ruend
# baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
Lreputredis(company_field)
log.error(f'未找到工商信息,重新塞入redis')
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# aa_dic['ynDomestic'] = ynDomestic
# aa_dic['countryName'] = countryName
# aa_dic['securitiesCode'] = securitiesCode
# aa_dic['securitiesShortName'] = securitiesShortName
# aa_dic['listingDate'] = listingDate
# aa_dic['category'] = category
# aa_dic['exchange'] = exchange
# aa_dic['listingType'] = listType
# # sendkafka(aa_dic)
# print(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
......@@ -561,7 +650,8 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
......@@ -623,11 +713,30 @@ def spiderwork(soup, receptname, securitiesCode, securitiesShortName, listingDat
return False
return True
def login():
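# Attach a cookie set obtained from token.get_cookies() to the already-open driver and refresh the page so
# Tianyancha sees a logged-in session; returns the driver together with the cookie record id used later by token.updateTokeen.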
# time.sleep(10)
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/天眼查'
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file.createFile(file_name)
......@@ -644,7 +753,7 @@ if __name__ == '__main__':
# cookies = {}
# for cookie in cookies_list:
# cookies[cookie['name']] = cookie['value']
s = requests.Session()
# s = requests.Session()
# s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
......@@ -674,7 +783,8 @@ if __name__ == '__main__':
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
# baseCore.r.lpush('BaseInfoEnterprise:gnqy_socialCode', company_field)
Lreputredis(company_field)
continue
continue
# company_field_ = f'|{company_field}'
......@@ -701,7 +811,7 @@ if __name__ == '__main__':
count = redaytowork(com_name, social_code, securitiesCode, securitiesShortName, listingDate, category, exchange,
listType, ynDomestic, countryName, file_name)
time.sleep(10)
# break
break
# baseCore.r.close()
# baseCore.sendEmail(file_name)
# 信息采集完成后将该企业的采集次数更新
......
# -*- coding: utf-8 -*-
import json
import re
import time
import datetime
import pymongo
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from kafka import KafkaProducer
import urllib3
from retry import retry
from selenium.webdriver.support.wait import WebDriverWait
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
'天眼查登录信息']
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import sys
# sys.path.append('D:\\KK\\zzsn_spider\\base')
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
cnx = baseCore.cnx_
cursor = baseCore.cursor_
log = baseCore.getLogger()
from classtool import Token, File, Tag
token = Token()
file = File()
tag = Tag()
from selenium import webdriver
from selenium.webdriver.common.by import By
def create_driver():
path = r'D:\soft\msedgedriver.exe'
# options = webdriver.EdgeOptions()
options = {
"browserName": "MicrosoftEdge",
"ms:edgeOptions": {
"extensions": [], "args": ["--start-maximized"] # 添加最大化窗口运作参数
}
}
session = webdriver.Edge(executable_path=path, capabilities=options)
return session
# 发送数据
def sendkafka(post_data):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
kafka_result = producer.send("enterpriseInfo", json.dumps(post_data, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
except:
exception = 'kafka传输失败'
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
log.info(f"{com_name}--{social_code}--kafka传输失败")
# 检查登陆状态
def checklogin(key):
t = int(time.time())
# url = 'https://www.tianyancha.com/search?key=%E4%B8%AD%E5%9B%BD%E7%9F%B3%E6%B2%B9%E5%8C%96%E5%B7%A5%E9%9B%86%E5%9B%A2%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&sessionNo=1706594186.22975563'
url = f'https://www.tianyancha.com/search?key={key}&sessionNo={t}'
driver.get(url)
time.sleep(2)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# todo:检查未登录状态
# if soup.find('title').text == '会员登录 - 企查查':
# log.info('状态---未登录')
# soup = ''
# return soup
return soup
# 合并基本信息和工商信息字段
def getinfo(dict1,dict2):
# 取出两个字典的key值集合
keys1 = set(dict1.keys())
keys2 = set(dict2.keys())
# 取出并集
union_keys = keys1 | keys2
# 根据并集的key值,从两个字典中取出value值,组成新的字典
result_dict = {key: dict1.get(key, None) or dict2.get(key, None) for key in union_keys}
return result_dict
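# 用法示例(仅为示意,字典内容为假设数据,非真实采集结果):
# getinfo 对两个字典的键取并集,值优先取 dict1 中非空的,否则取 dict2 中的
# dict1 = {'企业名称': '示例公司', '电话': None}
# dict2 = {'电话': '010-00000000', '邮箱': 'demo@example.com'}
# getinfo(dict1, dict2)
# 期望结果: {'企业名称': '示例公司', '电话': '010-00000000', '邮箱': 'demo@example.com'}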
def dic_handle(result_dic):
zxss = ['北京市', '天津市', '上海市', '重庆市']
try:
company_name = result_dic['企业名称']
except:
company_name = None
try:
CreditCode = result_dic['统一社会信用代码']
except:
CreditCode = None
try:
OperName = result_dic['法定代表人']
except:
OperName = None
try:
PhoneNumber = result_dic['电话']
except:
PhoneNumber = None
try:
WebSite = result_dic['网址']
except:
WebSite = None
try:
Email = result_dic['邮箱']
except:
Email = None
try:
Desc = result_dic['简介']
except:
Desc = None
try:
Status = result_dic['经营状态']
except:
try:
Status = result_dic['公司现状']
except:
Status = None
try:
StartDate = result_dic['成立日期']
except:
StartDate = None
try:
RecCap = result_dic['实缴资本']
except:
RecCap = None
try:
RegistCapi = result_dic['注册资本']
except:
RegistCapi = None
try:
CheckDate = result_dic['核准日期']
except:
CheckDate = None
try:
OrgNo = result_dic['组织机构代码']
except:
OrgNo = None
try:
No = result_dic['工商注册号']
except:
No = None
try:
taxpayerNo = result_dic['纳税人识别号']
except:
taxpayerNo = None
try:
EconKind = result_dic['企业类型']
except:
EconKind = None
try:
TermStart = result_dic['营业期限'].split('至')[0]
except:
TermStart = None
try:
TeamEnd = result_dic['营业期限'].split('至')[1]
except:
TeamEnd = None
try:
TaxpayerType = result_dic['纳税人资质']
except:
TaxpayerType = None
try:
SubIndustry = result_dic['国标行业']
except:
SubIndustry = None
# try:
# region = result_dic['所属地区']
# except:
# region = None
# try:
# pattern = r'^(.*?省|.*?自治区)?(.*?市|.*?自治州)?(.*?区|.*?县|.*?自治县|.*?市辖区)?(.*?区|.*?县|.*?自治县|.*?市辖区)?$'
# matches = re.match(pattern, region)
# Province = matches.group(1)
# City = matches.group(2)
# County = matches.group(3)
# if Province is None:
# for zxs in zxss:
# if zxs in region:
# Province = zxs
# break
# except:
# Province = None
# City = None
# County = None
try:
BelongOrg = result_dic['登记机关']
except:
BelongOrg = None
try:
Info = result_dic['人员规模']
except:
Info = None
try:
can_bao = result_dic['参保人数']
except:
can_bao = None
try:
OriginalName = result_dic['曾用名']
except:
OriginalName = None
try:
EnglishName = result_dic['英文名称']
except:
EnglishName = None
try:
IxCode = result_dic['进出口企业代码']
except:
IxCode = None
try:
Address = result_dic['地址']
except:
Address = None
try:
Scope = result_dic['经营范围']
except:
Scope = None
aa_dict = {
'name': company_name, # 企业名称
'shortName': None, # 企业简称
'socialCreditCode': CreditCode, # 统一社会信用代码
'legalPerson': OperName, # 法定代表人
'officialPhone': PhoneNumber, # 电话
'officialUrl': WebSite, # 官网
'officialEmail': Email, # 邮箱
'briefInfo': Desc, # 简介
'registerStatus': Status, # 登记状态
'incorporationDate': StartDate, # 成立日期
'capital': RegistCapi, # 注册资本
'paidCapital': RecCap, # 实缴资本
'approvalDate': CheckDate, # 核准日期
'organizationCode': OrgNo, # 组织机构代码
'registerNo': No, # 工商注册号
'taxpayerNo': taxpayerNo, # 纳税人识别号
'type': EconKind, # 企业类型
'businessStartDate': TermStart, # 营业期限自
'businessEndDate': TeamEnd, # 营业期限至
'taxpayerQualification': TaxpayerType, # 纳税人资质
'industry': SubIndustry, # 所属行业
'region': None,
'province': None, # 所属省
'city': None, # 所属市
'county': None, # 所属县
'registerDepartment': BelongOrg, # 登记机关
'scale': Info, # 人员规模
'insured': can_bao, # 参保人数
'beforeName': OriginalName, # 曾用名
'englishName': EnglishName, # 英文名
'importExportEnterpriseCode': IxCode, # 进出口企业代码
'address': Address, # 地址
'businessRange': Scope, # 经营范围
'status': 0, # 状态
}
return aa_dict
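# 用法示例(示意,输入为假设数据):dic_handle 把中文键的工商信息映射为英文驼峰字段,缺失的键统一取 None
# aa = dic_handle({'企业名称': '示例科技有限公司', '统一社会信用代码': '91XXXXXXXXXXXXXXXX'})
# aa['name'] -> '示例科技有限公司'
# aa['socialCreditCode'] -> '91XXXXXXXXXXXXXXXX'
# aa['legalPerson'] -> None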
# 获取基本信息
def baseinfo(com_soup):
baseinfo = com_soup.find('div', class_='index_detail__JSmQM')
cominfo_list = baseinfo.find_all('div', class_='index_detail-info-item__oAOqL') #name
data = {}
for cominfo in cominfo_list:
name = cominfo.find('span', class_='index_detail-label__oRf2J').text.replace(':', '').replace(' ', '')
# print(name)
tag.deletep(cominfo, 'span', 'class', 'index_detail-label__oRf2J')
tag.deletep(cominfo, 'i', 'class', 'index_detail-text-desc__myXYK')
# print(info)
value = cominfo.text.replace('', '').replace('\ue657', '').replace('\ue655', '')
if name == '法定代表人':
try:
value = cominfo.find('a').text
except:
value = None
if name == '电话':
try:
value = cominfo.find('span').text
except:
value = None
if name == '邮箱':
try:
value = cominfo.find('a').text
except:
value = None
if name == '网址':
try:
value = cominfo.find('a').text
except:
value = None
if name == '地址':
try:
value = cominfo.find('span').text
except:
value = None
data[name] = value
# print("==================")
briefTag = baseinfo.find('div', class_='index_detail-linewrap__AKtCa index_-intro__ma3Qd')
span_list = briefTag.find_all('span')
for span in span_list:
if len(span.attrs) == 0:
data['简介'] = span.text.split('通过天眼查大数据分析')[0]
break
return data
# 采集准备
def redaytowork(com_name, social_code, file_name):
log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
count = 0
# 如果没有信用代码 就通过名字搜索 如果有信用代码 就通过信用代码
if social_code:
soup = checklogin(social_code)
else:
soup = checklogin(com_name)
if not soup:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
# log.info('=====已重新放入redis,失效cookies已删除======')
time.sleep(20)
return count
else:
try:
searchinfo = soup.find('div', class_='index_content-tool-title__K1Z6C').find('span', class_='index_title-count__lDSjB').text
except:
try:
# todo:可能是搜不到该企业
errormessage = soup.find('div', class_='index_no-data-reason-title__V3gFY').text
if '抱歉' in errormessage:
log.info('=====搜索不到该企业====')
data = [com_name, social_code]
# todo:搜不到的企业需要返回到一个表格中
file.appenddata(file_name, '需处理企业', data)
return count
except:
log.info("登录失效===重新放入redis")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
time.sleep(20)
return count
# 开始采集
try:
if spiderwork(soup, com_name, file_name):
count += 1
log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
token.updateTokeen(id_cookie,3)
return count
else:
return count
except Exception as e:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return count
def ifbeforename(company_url):
driver.get(company_url)
time.sleep(2)
com_soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
businessinfo = com_soup.find('table', {'class': 'index_tableBox__ZadJW'})
except:
businessinfo = ''
if businessinfo:
try:
name = businessinfo.find('span', class_='index_history-gray-tags__o8mkl').text
value = businessinfo.find('span', class_='index_copy-text__ri7W6').text.replace('展开', '').replace(' ', '').replace('…','').replace('\n', '').replace('复制', '').split('(')[0]
except:
name = '曾用名'
value = ''
return value
else:
return ''
#解析时间
def paserTime(publishtime):
timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
current_datetime = datetime.datetime.now()
publishtime = publishtime.strip()
print(publishtime)
try:
if '年前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=365 * day)
publishtime = current_datetime - delta
elif '月前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
publishtime = current_datetime - relativedelta(months=day)
# publishtime = current_datetime - delta
elif '周前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(weeks=day)
publishtime = current_datetime - delta
elif '天前' in publishtime:
numbers = re.findall(r'\d+', publishtime)
day = int(numbers[0])
delta = datetime.timedelta(days=day)
publishtime = current_datetime - delta
elif '前天' in publishtime:
delta = datetime.timedelta(days=2)
publishtime = current_datetime - delta
elif '昨天' in publishtime:
current_datetime = datetime.datetime.now()
delta = datetime.timedelta(days=1)
publishtime = current_datetime - delta
elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
hours = re.findall(r'(\d+)小时', publishtime)
minutes = re.findall(r'(\d+)分钟', publishtime)
hour = int(hours[0]) if hours else 0
minute = int(minutes[0]) if minutes else 0
delta = datetime.timedelta(hours=hour, minutes=minute)
publishtime = current_datetime - delta
elif '年' in publishtime and '月' in publishtime:
time_format = '%Y年%m月%d日'
publishtime = datetime.datetime.strptime(publishtime, time_format)
elif '月' in publishtime and '日' in publishtime:
current_year = current_datetime.year
time_format = '%Y年%m月%d日'
publishtime = str(current_year) + '年' + publishtime
publishtime = datetime.datetime.strptime(publishtime, time_format)
except Exception as e:
print('时间解析异常!!')
return publishtime
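# 用法示例(示意,相对时间的结果随运行时刻变化,非固定值):
# paserTime('3天前')            -> 当前时间减去 3 天的 datetime 对象
# paserTime('2周前')            -> 当前时间减去 2 周的 datetime 对象
# paserTime('2024年02月26日')   -> datetime.datetime(2024, 2, 26, 0, 0)
# 无法识别的格式会打印“时间解析异常!!”并原样返回传入的字符串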
@retry(tries=2,delay=3)
def getBusinessinfo(com_soup):
com_soup_ = com_soup.find('div',attrs={'data-dim':'baseInfo'})
businessinfo = com_soup_.find('table', {'class': 'index_tableBox__ZadJW'})
if not businessinfo:
businessinfo = com_soup_.find('table', {'class': 'index_tableBox__ZadJW '})
if not businessinfo:
raise RuntimeError('工商信息未找到')
return businessinfo
# 采集基本信息和工商信息
def spiderinfo(company_url, receptname, file_name):
qccid = company_url.split('company/')[1]
log.info(f'====={qccid}=====')
driver.get(company_url)
# req_ = s.get(headers=headers, url=company_url)
page_source_detail = driver.page_source
com_soup = BeautifulSoup(page_source_detail, 'html.parser')
#todo:天眼查更新时间 正常请求不到 需要使用模拟浏览器
try:
sourceUpdateTime_ = com_soup.find('div', class_='index_detail-refresh__6W7U4').find('span').text
pattern = r'\d{4}-\d{2}-\d{2}'
matched = re.findall(pattern, sourceUpdateTime_)
if matched:
sourceUpdateTime = sourceUpdateTime_
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
log.info(f'天眼查无该企业{social_code}')
return
try:
businessinfo = getBusinessinfo(com_soup)
except:
businessinfo = ''
if businessinfo:
data_baseinfo = baseinfo(com_soup)
# print(data_baseinfo)
tr_list = businessinfo.find_all('tr')
dic_buseniss = {}
for tr in tr_list:
# td_count = len(tr.find_all('td'))
# print(td_count)
td_list = tr.find_all('td')
td_count = len(td_list)
name_list = [td_list[i].text for i in range(td_count) if i % 2 == 0]
# print(name_list)
# value_list = [td_list[i].text for i in range(td_count) if i % 2 != 0]
value_list = []
for i in range(td_count):
if i % 2 != 0:
value_tag = td_list[i]
# print(value_tag)
# print("==============")
tag.deletep(value_tag, 'span', 'class', 'index_history-operate__t3kjv')
tag.deletep(value_tag, 'div', 'class', '_efcb8')
tag.deletep(value_tag, 'span', 'class', 'index_legal-bottom-info__bYvYZ')
tag.deletep(value_tag, 'a', 'class', 'ml8 link-click')
tag.deletep(value_tag, 'span', 'class', 'index_report-jump__z__UW')
tag.deletep(value_tag, 'span', 'class', 'index_branch-report__Nyf_Y')
# for value_tag in value_tag_list:
value_list.append(value_tag.text.replace('\xa0', ''))
# print(value_list)
if len(name_list) == len(value_list):
for i in range(len(name_list)):
dic_buseniss[name_list[i]] = value_list[i]
if '曾用名' in value_list[i]:
dic_buseniss['曾用名'] = value_list[i].split('曾用名')[1].split('更多')[0]
dic_buseniss[name_list[i]] = value_list[i].split('曾用名')[0]
if name_list[i] == '法定代表人':
value_list[i] = value_list[i].split('任职')[0]
dic_buseniss[name_list[i]] = value_list[i]
try:
del dic_buseniss['天眼评分']
except:
pass
# print(dic_buseniss)
result_dict = getinfo(dic_buseniss, data_baseinfo)
# 主要针对香港台湾企业,社会信用代码传为给定的
try:
result_dict['统一社会信用代码']
except:
# log.info('未获取到统一社会信用代码')
if social_code:
result_dict['统一社会信用代码'] = social_code
else:
# 如果未给定社会信用代码,则返回
return False
if result_dict['企业名称'].startswith('(') and result_dict['企业名称'].endswith(')'):
result_dict['企业名称'] = result_dict['企业名称'][1:-1]
if result_dict['企业名称'] == '-' and com_name:
result_dict['企业名称'] = com_name
elif not com_name:
return False
else:
pass
# print(result_dict)
# 采集成功的企业
data = [com_name, result_dict['企业名称'], social_code, result_dict['统一社会信用代码']]
file.appenddata(file_name, '获取基本信息成功企业', data)
# 将字段转化成英文驼峰
aa_dic = dic_handle(result_dict)
aa_dic['sourceUpdateTime'] = sourceUpdateTime
aa_dic['qccId'] = qccid
log.info(aa_dic)
# sendkafka(aa_dic)
# print(aa_dic)
# post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# dic_info = json.dumps(aa_dic)
# req = requests.post(post_url, data=dic_info)
else:
# todo: 重新放入redis
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
log.error(f'未找到工商信息,重新塞入redis')
token.updateTokeen(id_cookie, 3)
# data_baseinfo = baseinfo(com_soup)
# # 主要针对香港台湾企业,社会信用代码传为给定的
# try:
# data_baseinfo['统一社会信用代码']
# except:
# log.info('未获取到统一社会信用代码')
# if social_code:
# data_baseinfo['统一社会信用代码'] = social_code
# else:
# # 如果未给定社会信用代码,则返回
# return False
# if data_baseinfo['企业名称'].startswith('(') and data_baseinfo['企业名称'].endswith(')'):
# data_baseinfo['企业名称'] = data_baseinfo['企业名称'][1:-1]
# if data_baseinfo['企业名称'] == '-' and com_name:
# data_baseinfo['企业名称'] = com_name
# elif not com_name:
# return False
# else:
# pass
# # 采集成功的企业
# data = [com_name, data_baseinfo['企业名称'], social_code, data_baseinfo['统一社会信用代码']]
# file.appenddata(file_name, '获取基本信息成功企业', data)
# # 将字段转化成英文驼峰
# aa_dic = dic_handle(data_baseinfo)
# aa_dic['sourceUpdateTime'] = sourceUpdateTime
# aa_dic['qccId'] = qccid
# # sendkafka(aa_dic)
# log.info(aa_dic)
# # post_url = 'http://192.168.1.41:8088/enterprise/check/judge'
# # dic_info = json.dumps(aa_dic)
# # req = requests.post(post_url, data=dic_info)
def remove_parentheses(text):
# 清除中文小括号
text = re.sub(r'（|）', '', text)
# 清除英文小括号
text = re.sub(r'\(|\)', '', text)
return text.replace(' ', '')
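# 用法示例(示意):同时清除中英文小括号和空格,便于企业名称比对
# remove_parentheses('小米通讯（技术）有限公司 (Xiaomi)') -> '小米通讯技术有限公司Xiaomi'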
# 判断名称是否统一
def spiderwork(soup, receptname, file_name):
company_url = ''
try:
company_list = soup.find_all('div', class_='index_search-box__7YVh6')
except:
log.info(f'====={social_code}=====获取基本信息失败,重新放入redis=====')
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
# token.updateTokeen(id_cookie,2)
log.info('=====已重新放入redis,cookies已封号======')
return False
# receptname = '小米通讯技术有限公司'
for compamy in company_list:
info_t = compamy.find('div', class_='index_name__qEdWi')
getname = info_t.find('span').text
log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
if receptname and getname == receptname:
company_url = info_t.find('a')['href']
break
elif not receptname:
company_url = info_t.find('a')['href']
break
else:
jian_name = remove_parentheses(baseCore.hant_2_hans(getname))
if remove_parentheses(receptname) == jian_name:
log.info(f'接收到的企业名称--{receptname}---转化成简体字的企业名称--{jian_name}')
company_url = info_t.find('a')['href']
break
else:
continue
if company_url:
# 采集基本信息和工商信息
spiderinfo(company_url, receptname, file_name)
else:
# 判断是否是曾用名
getname = ''
for child in company_list[0].find_all():
if child.has_attr('class'):
# print(child['class'])
if 'index_name' in child['class'][0]:
getname = child.text
company_url = child.find('a')['href']
break
# tr = company_list[:1][0]
# info_t = tr.find('div', class_='index_name__qEdWi')
# getname = info_t.find('span').text
if getname:
log.info(f'------可能是曾用名------接收到的企业名称--{receptname}---采到的企业名称--{getname}')
beforename = ifbeforename(company_url)
if beforename == receptname:
spiderinfo(company_url, receptname, file_name)
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
else:
# 没有搜到相同的企业名称
data = [com_name, social_code]
file.appenddata(file_name, '需处理企业', data)
time.sleep(2)
return False
return True
def login():
# time.sleep(10)
cookies_list, id_cookie, user_name = token.get_cookies()
log.info(f'=====当前使用的是{user_name}的cookie======')
for cookie in cookies_list:
driver.add_cookie(cookie)
time.sleep(5)
driver.refresh()
# url_test = 'https://www.qcc.com/firm/a5f5bb3776867b3e273cd034d6fb4baa.html'
# driver.get(url_test)
# # driver.get('https://www.qcc.com/')
time.sleep(5)
return driver,id_cookie
if __name__ == '__main__':
taskType = '基本信息/天眼查'
# driver = create_driver()
# #手动登录
# driver.get('https://www.tianyancha.com/')
#todo:绕过验证使用cookies登录
# driver, id_cookie = login()
driver = create_driver()
url = 'https://www.tianyancha.com/'
driver.get(url)
driver.maximize_window()
while True:
# todo:绕过验证使用cookies登录
driver, id_cookie = login()
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
company_field = '91330000742906207U|浙江我武生物科技股份有限公司'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
baseCore.sendEmail(file_name)
time.sleep(20)
file.deleteFile(file_name)
continue
if company_field == '' or company_field is None:
# 本轮结束后没有新增的企业要采集
file.deleteFile(file_name)
flag = True
while flag:
log.info('--------已没有数据---------')
time.sleep(30)
if not baseCore.check_mysql_conn(cnx_):
# 144数据库
cnx_ = baseCore.cnx
cursor_ = cnx_.cursor()
log.info('===11数据库重新连接成功===')
company_field = baseCore.redicPullData('UpdateBasdeInfo:SocialCode_CompanyName')
if company_field:
flag = False
log.info("-----已添加数据------")
baseCore.r.lpush('UpdateBasdeInfo:SocialCode_CompanyName', company_field)
continue
continue
# company_field_ = f'|{company_field}'
social_code = company_field.split('|')[0]
com_name = company_field.split('|')[1].replace(' ', '')
if 'ZZSN' in social_code or 'ZD' in social_code:
continue
#todo:查询天眼查id
data = baseCore.getInfomation(social_code)
if len(data) != 0:
tycid = data[11]
else:
# 数据重新塞入redis
# log.info(f'数据库中无该企业{social_code}')
sql = f"SELECT * FROM sys_base_enterprise WHERE social_credit_code = '{social_code}'"
cursor.execute(sql)
data = cursor.fetchone()
if data:
pass
else:
# 数据库中并没有该企业 需要新增
pass
com_name_c = data[3]
xydm = data[1]
# 写入数据库
insert = "INSERT INTO EnterpriseInfo(CompanyName, SocialCode) VALUES (%s, %s)"
cursor_.execute(insert, (com_name_c, xydm))
cnx_.commit()
tycid = ''
if tycid == None or tycid == '':
count = redaytowork(com_name, social_code, file_name)
else:
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, com_name, file_name)
time.sleep(10)
break
baseCore.close()
\ No newline at end of file
......@@ -223,7 +223,9 @@ def spiderinfo(company_url, receptname, file_name):
else:
sourceUpdateTime = paserTime(sourceUpdateTime_).strftime("%Y-%m-%d %H:%M:%S")
except:
redaytowork(com_name, social_code, file_name)
log.info(f'天眼查无该企业{social_code}')
return
aa_dict = {
'name': receptname, # 企业名称
'shortName': None, # 企业简称
......@@ -326,7 +328,7 @@ if __name__ == '__main__':
driver.get('https://www.tianyancha.com/')
while True:
nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
file_name = f'./data/国内企业基本信息采集情况.xlsx'
file_name = f'./data/国内企业基本信息更新.xlsx'
file.createFile(file_name)
# cookies_list, id_cookie = token.get_cookies()
# cookies = {}
......@@ -336,8 +338,8 @@ if __name__ == '__main__':
# s.cookies.update(cookies)
start_time = time.time()
# 获取企业信息
# company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
company_field = '913100006073602992|光明乳业股份有限公司'
company_field = baseCore.redicPullData('BaseInfoEnterpriseUptime:gnqy_socialCode')
# company_field = '913100006073602992|光明乳业股份有限公司'
if company_field == 'end':
# 本轮处理完毕,需要发送邮件,并且进入下一轮
......@@ -398,7 +400,7 @@ if __name__ == '__main__':
count = redaytowork(com_name, social_code, file_name)
else:
company_url = 'https://www.tianyancha.com/company/' + tycid
spiderinfo(company_url, social_code, file_name)
spiderinfo(company_url, com_name, file_name)
time.sleep(10)
# break
baseCore.close()
\ No newline at end of file
......@@ -59,7 +59,8 @@ class Token():
result = db_storage.find_one(query, sort=[('updateTime', 1)])
cookies = result['cookies']
id_token = result['_id']
return cookies, id_token
user_name = result['name']
return cookies, id_token, user_name
# 删除失效的token
def delete_token(self, cookie_):
......
"""
从es中拿到所有的标题
"""
import redis
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'researchreportdata'
def queryatt(self,index_name):
body = {
"query": {
"bool": {
"must": [
{
"nested": {
"path": "labels",
"query": {
"match": {
"labels.relationId": "91330000747735638J"
}
}
}
},
{
"range": {
"createDate": {
"gte": "2024-02-26T13:00:00",
"lte": "2024-02-27T00:00:00"
}
}
},
{
"term": {
"type.keyword": {
"value": "3"
}
}
}
]
}
},
"sort": [
{
"createDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 100
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.title',
'hits.hits._source.origin',
'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
if __name__ == '__main__':
es_method = EsMethod()
# 连接Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
result = es_method.queryatt('researchreportdata')
total = result['hits']['total']['value']
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
log.info(f'---第1页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
title = mms['_source']['title']
origin = mms['_source']['origin']
pub_time = mms['_source']['publishDate']
try:
log.info(f'{id}--{title}--{origin}--')
item = id + "|" + title
# r.lrem(f'XJPdatabase:id_2', 0, item)
r.lpush(f'91330000747735638J:id', item)
except:
continue
......@@ -31,7 +31,7 @@ class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'),timeout=300 )
self.index_name='researchreportdata'
self.index_name='researchreportdata_2024'
'''
删除
......@@ -47,13 +47,13 @@ if __name__ == "__main__":
redis_conn = redis.Redis(connection_pool=pool)
while True:
# 从redis中读取数据,去附件表中根据title查询,更新查到的附件id
item = redis_conn.lpop('YanBao:id')
item = redis_conn.lpop('91330000747735638J:id')
if item:
log.info(item)
id = item.decode()
id = int(item.decode().split('|')[0])
try:
esMethod.delete(esMethod.index_name,id)
except:
except Exception as e:
continue
else:
log.info('已删除完毕')
......
......@@ -51,7 +51,7 @@ def parse_excel():
def get_content1():
print_result_list = []
result_dict_list = []
# query = {"专家库主键id":"1204"}
# query = {"专家库主键id":"141"}
# for db_dict in db_storage.find(query):
for db_dict in db_storage.find():
del db_dict['_id']
......
......@@ -2,8 +2,12 @@
中证智能财讯
"""
import json
import os
import sys
import time
import redis
from kafka import KafkaProducer
from obs import ObsClient
import fitz
import requests
......@@ -11,6 +15,10 @@ from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By
from selenium import webdriver
from tempfile import NamedTemporaryFile
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
......@@ -36,68 +44,14 @@ def create_driver():
@retry(tries=3, delay=1)
def getOBSres(pathType, name, response):
result = obsClient.putContent('zzsn', f'{pathType}/' + name, content=response.content)
# result = obsClient.putFile('zzsn', pathType+name, file_path=response)
result = obsClient.putFile('zzsn', pathType+name, file_path=response)
return result
def uptoOBS(pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by):
headers = {}
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(baseCore.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = getOBSres(pathType, now_time, name, response)
except:
log = baseCore.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = baseCore.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = baseCore.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
def zzcx():
driver = create_driver()
driver.maximize_window()
url = 'https://zzcx.cs.com.cn/dist/publishManuscript/listES'
payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
# payload = {"pageNo": 1, "pageSize": 15, "statusList": [0], "keyword": ""}
headers = {
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -115,11 +69,12 @@ def zzcx():
'Origin': 'https://zzcx.cs.com.cn',
'Referer': 'https://zzcx.cs.com.cn/app/zzb/list?spm=0.0.0.0.wjnSUZ'
}
payload = json.dumps(payload)
result_json = requests.post(url=url, data=payload, headers=headers).json()
print(result_json)
pages = result_json['data']['pages']
for page in range(1, int(pages + 1)):
# payload = json.dumps(payload)
# result_json = requests.post(url=url, data=payload, headers=headers).json()
# print(result_json)
# pages = result_json['data']['pages']
pages = 5
for page in range(1, int(pages) + 1):
payload_page = {"pageNo": page, "pageSize": 15, "statusList": [0], "keyword": ""}
payload_page = json.dumps(payload_page)
datas = requests.post(url=url, data=payload_page, headers=headers)
......@@ -127,24 +82,135 @@ def zzcx():
for news in records:
title = news['title']
news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=' + news['manuscriptId']
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=3ec65751b63e40d7813a0c6bbe9b3135'
try:
flag = r.sismember('IN-20240129-0001', news_url)
if flag:
log.info('信息已采集入库过')
continue
except Exception as e:
continue
# news_url = 'https://zzcx.cs.com.cn/app/zzb/detail?id=2eeeb171e36b42ada02dad77b80038b1'
# 使用模拟浏览器打开
driver = create_driver()
driver.get(news_url)
div_ = driver.find_element(By.ID, 'line')
div = div_.find_element(By.XPATH, '..')
image_data = div.screenshot_as_base64
div_photo = driver.find_elements(By.ID, 'line')
for png_ in div_photo:
try:
div = png_.find_element(By.XPATH, './/div/div[1]/div')
# div = png_.find_element(By.CLASS_NAME, 'ant-col ant-col-17')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", div)
time.sleep(1)
#todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
# baseCore.uptoOBS()
html = driver.page_source
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
#todo:替换标签 删除标签
dele_tag = png_.find_element(By.XPATH, './/div/div[1]//div')
driver.execute_script("arguments[0].remove()", dele_tag)
#todo:将图片塞进去 新建一个new_tag
append_tag = png_.find_element(By.XPATH, './/div/div[1]')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
append_tag)
os.remove(file_path)
except:
continue
# div_undefined_line = driver.find_elements(By.ID, 'k-line-undefined')
div_undefined_line = driver.find_elements(By.ID, 'KLineSubscription')
for u_png in div_undefined_line:
div_u = u_png.find_element(By.XPATH, './/div')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", div_u)
time.sleep(3)
# todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
div_u.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
# todo:替换标签 删除标签
dele_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag)
# todo:将图片塞进去 新建一个new_tag
# append_tag = u_png.find_element(By.XPATH, './/div')
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
u_png)
os.remove(file_path)
div_line_bar = driver.find_elements(By.ID, 'bar-line-bar-line')
for lin_bar_tag in div_line_bar:
line_bars = lin_bar_tag.find_elements(By.XPATH, './/div[contains(@class, "ant-col-11")]')
for line_bar in line_bars:
photo_line_bar = line_bar.find_element(By.XPATH, './/div')
# todo:滚轮需要滑动
driver.execute_script("arguments[0].scrollIntoView();", photo_line_bar)
time.sleep(1)
# todo:保存成临时文件
temp_file = NamedTemporaryFile(delete=False, suffix=".png")
temp_file.close()
photo_line_bar.screenshot(temp_file.name)
file_path = temp_file.name
# todo:保存到obs链接及标签替换
name = str(baseCore.getuuid())
result = getOBSres(pathType, name, file_path)
path = result['body']['objectUrl'].split('.com')[1]
full_path = result['body']['objectUrl']
news_req = requests.get(url=news_url, headers=headers)
news_soup = BeautifulSoup(news_req.content, 'html.parser')
# todo:替换标签 删除标签
dele_tag_ = line_bar.find_element(By.XPATH, './/div')
driver.execute_script("arguments[0].remove()", dele_tag_)
# todo:将图片塞进去 新建一个new_tag
driver.execute_script(
"var newElement = document.createElement('img'); newElement.src = 'http://zzsn.luyuen.com" + path + "'; newElement.style.width = '50%'; newElement.style.position = 'relative'; newElement.style.float = 'left'; arguments[0].insertBefore(newElement, arguments[0].firstChild);",
line_bar)
# #todo:创建清晰的图片标签
# driver.execute_script(f"""
# var img = new Image();
# img.src = "http://zzsn.luyuen.com{path}"; // 替换为你的图片路径
# img.onload = function() {{
# var canvas = document.createElement("canvas");
# canvas.width = img.width;
# canvas.height = img.height;
# var ctx = canvas.getContext("2d");
# ctx.drawImage(img, 0, 0);
# document.body.appendChild(canvas);
# }}; arguments[0].insertBefore(img, arguments[0].firstChild);
# """, line_bar)
os.remove(file_path)
html = driver.page_source
news_soup = BeautifulSoup(html, 'html.parser')
detail_info = news_soup.find('div', class_='subTitle___svblj')
div_list = detail_info.find_all('div')
origin = div_list[0].text
publishDate = div_list[1].text
contentWithTag = news_soup.find('div', class_='editable___1EtCQ editor-editable')
# print(contentWithTag)
for tag in contentWithTag.find_all('span'):
if tag.text == '\ufeff':
tag.decompose()
content = contentWithTag.text
info_code = 'IN-20240129-0001'
result_dict = {
......@@ -152,25 +218,30 @@ def zzcx():
'sid': '1751787750127857666',
'title': title,
'organ': origin,
'origin': '国务院国有资产监督管理委员会',
'origin': origin,
# '摘要': zhaiyao,
'source': 16,
'content': content,
'contentWithTag': contentWithTag,
'contentWithTag': str(contentWithTag),
'publishDate': publishDate,
'sourceAddress': news_url,
}
log.info(f'{page}--{title}--{href}')
# info_list.append(result_dict)
log.info(f'{page}--{title}--{news_url}')
print(result_dict)
# break
# break
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("crawlerInfo",
json.dumps(result_dict, ensure_ascii=False).encode('utf8'))
r.sadd(info_code + '-test', href)
r.sadd(info_code, news_url)
log.info('发送kafka成功!')
except Exception as e:
log.info(e)
finally:
producer.close()
if __name__ == "__main__":
pathType = 'PhotoDingzhi/'
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
zzcx()
\ No newline at end of file
# 中央全面深化改革委员会会议
import json
import sys
import time
import redis
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from kafka import KafkaProducer
headers = {
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
header = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
......@@ -26,22 +32,50 @@ headers = {
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
'Connection': 'keep-alive',
'Cookie': 'cna=HcAKHtgXUG4CAQHBO1G6ZJYK',
'Host': 'news.12371.cn',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
'sec-ch-ua': '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
if __name__ == "__main__":
# 中央全面深化改革委员会会议
r = redis.Redis(host='114.115.236.206', port=6379, password='clbzzsn', db=5)
# 中央全面深化改革领导小组会议
# url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/', 'https://www.12371.cn/special/zyqmshggldxzhy19/']
url_list = ['https://www.12371.cn/special/zyqmshggldxzhy19/']
for url in url_list:
request = requests.get(url=url, headers=headers)
url = 'https://www.12371.cn/special/zyqmshggldxzhy19/'
request = requests.get(url=url, headers=header)
soup = BeautifulSoup(request.content, 'html.parser')
# print(soup)
request.encoding = request.apparent_encoding
# print(soup)
info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
ul_list = info_html.find_all('li')
for ul in ul_list:
# info_html = soup.find('div', id='SUBD1663831285709121').find('ul', class_='ul_list')
info_html_list = soup.find_all('div', class_='dyw1023_right_list01 hyty')
flag = 1
for info_html in info_html_list:
if flag == 1:
info_code = 'IN-20230816-0004'
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
info_code = 'IN-20230816-0005'
ul_list = info_html.find('ul', class_='ul_list').find_all('li')
for ul in ul_list[::-1]:
publishDate_ = str(ul.find('span').text)
date_obj= datetime.strptime(publishDate_, "%Y年%m月%d日")
publishDate = date_obj.strftime('%Y-%m-%d')
......@@ -51,18 +85,27 @@ if __name__ == "__main__":
newsUrl = ul.find('a')['href']
summary = ul.find('a').text
# todo: 链接判重
news_request = requests.get(url=newsUrl, headers=headers)
try:
flag = r.sismember(info_code, newsUrl)
if flag:
log.info('信息已采集入库过')
continue
except Exception as e:
continue
news_request = requests.get(url=newsUrl, headers=headers, allow_redirects=False)
news_soup = BeautifulSoup(news_request.content, 'html.parser')
print(news_soup)
# print(news_soup)
try:
title = news_soup.find('h1', class_='big_title').text
source = news_soup.find('div', class_='title_bottom').find('i').text
contentwithTag = news_soup.find('div', class_='word')
content = contentwithTag.text
if url == 'https://www.12371.cn/special/zyqmshggldxzhy19/':
sid = '1691633319715676162'
else:
sid = '1691633869186277378'
except Exception as e:
log.error(f'解析网页出错{newsUrl}')
continue
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_info ={
'id': '1681549361661489154' + str(int(time.time()*1000)),
'title': title,
......@@ -79,6 +122,7 @@ if __name__ == "__main__":
'createDate': time_now,
}
r.sadd(info_code, newsUrl)
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
try:
kafka_result = producer.send("research_center_fourth",
......@@ -90,3 +134,4 @@ if __name__ == "__main__":
print('发送kafka异常!')
finally:
producer.close()
flag += 1
\ No newline at end of file
......@@ -174,6 +174,76 @@ def zyzsjg():
# sendKafka(data)
print(data)
def dfrwk():
datas_df = []
url_df = 'http://district.ce.cn/zt/rwk/'
req = requests.get(url=url_df, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
df_list = soup.find('div', class_='left1').find_all('div')
for df in df_list:
df_place = df.text.replace('\n', '')
try:
df_href = df.find('a')['href']
except:
df_href = ''
if df_href:
datas_df.append([df_place,df_href])
print(datas_df)
peoples = []
for data in datas_df:
place = data[0]
href = data[1]
req_df = requests.get(url=href, headers=headers)
soup_df = BeautifulSoup(req_df.content, 'html.parser')
df_list_df = soup_df.find_all('div', class_='left2')
for df in df_list_df:
try:
rwpart = df.find('div', class_='ren2')
except:
log.error(f'{place}===={href}')
continue
if rwpart:
pass
else:
continue
tr_list = rwpart.find_all('tr')
for tr in tr_list:
td_list = tr.find_all('td')
if len(td_list) == 3:
leader = td_list[1].text
try:
leader_href = td_list[1].find('a')['href']
except:
leader_href = ''
# continue
position = td_list[2].text
print(place, leader, position)
if len(td_list) == 2:
leader = td_list[0].text
try:
leader_href = td_list[0].find('a')['href']
except:
leader_href = ''
# continue
position = td_list[1].text
print(place, leader, position)
people = {
'name': leader, # 姓名
'sex': '', # 性别
'work': position, # 职务
'birthplace': '', # 出生地
'birthday': '', # 出生日期
'company': '', # 曾任单位
'city': '', # 关联城市
'school': '', # 毕业院校
'province': '', # 省或直辖市
'type': 3, # 直属类别(1:部委人物库 2:中直任务库 3:地方人物库)
'department': '', # 部门
'headSculpture': '', # 照片链接
}
# print(name)
peoples.append(people)
def gwybw_task():
# 实例化一个调度器
......@@ -200,11 +270,12 @@ def zyzsjg_task():
if __name__ == "__main__":
try:
gwybw_task()
except:
log.error('部委人物采集出错')
try:
zyzsjg_task()
except:
log.error('中直人物采集出错')
# try:
# gwybw_task()
# except:
# log.error('部委人物采集出错')
# try:
# zyzsjg_task()
# except:
# log.error('中直人物采集出错')
dfrwk()
import os
......@@ -46,8 +46,28 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
......@@ -61,7 +81,6 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except:
time.sleep(3)
continue
page_size = 1
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
......@@ -85,7 +104,7 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
......
"""
"""
港股公告-更换采用接口的方式
"""
import os
import subprocess
import sys
import uuid
import fitz
import requests
from bs4 import BeautifulSoup
import time, json
from kafka import KafkaProducer
from obs import ObsClient
from urllib.parse import unquote
from retry import retry
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cnx = baseCore.cnx
cursor = baseCore.cursor
pathType = 'QYNotice/'
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
obsClient = ObsClient(
access_key_id='VEHN7D0TJ9316H8AHCAV', # 你的华为云的ak码
secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY', # 你的华为云的sk
server='https://obs.cn-north-1.myhuaweicloud.com' # 你的桶的地址
)
#获取文件大小
def convert_size(size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units)-1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except:
time.sleep(3)
continue
name = str(getuuid()) + category
try:
result = getOBSres(pathType, name, response)
except:
log.error(f'OBS发送失败')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = unquote(result['body']['objectUrl'].split('.com')[1])
retData['full_path'] = unquote(result['body']['objectUrl'])
retData['file_size'] = convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def secrchATT(item_id, retData, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s '''
cursor_.execute(sel_sql, (item_id, retData['path'], type_id))
selects = cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(retData, com_name, year, pdf_name, num):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
# selects = secrchATT(item_id, pdf_name, type_id)
#
# if selects:
# log.info(f'pdf_name:{pdf_name}已存在')
# id = ''
# return id
# else:
try:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name+category, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn')
cursor_.execute(Upsql, values) # 插入
cnx_.commit() # 提交
except Exception as e:
log.info(e)
log.info(f"更新完成:{item_id}===={pdf_name+category}")
try:
selects = secrchATT(item_id, retData, type_id)
except Exception as e:
log.info(e)
return ''
id = selects[0]
return id
def InsterInto(social_code, pdf_url,pub_time,pdf_name):
insert = False
# 信息插入数据库
try:
insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
list_info = [
social_code,
pdf_url,
'东方财富网',
'1',
pub_time[:10],
pdf_name
]
#144数据库
cursor.execute(insert_sql, tuple(list_info))
cnx.commit()
insert = True
return insert
except:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
return insert
def ifInstert(short_name, social_code, title, info_date):
ifexist = True
aa = info_date[:10]
sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and title = %s and publish_time = %s and origin='东方财富网' and type='1' '''
cursor.execute(sel_sql, (social_code, title, aa))
selects = cursor.fetchone()
#如果数据库中存在 则跳过
if selects:
ifexist = False
log.info(f'com_name:{short_name}、{title}, {info_date}已存在')
return ifexist
else:
return ifexist
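# 用法示例(示意,参数均为假设数据):标题+发布日期在库中已存在则返回 False,否则返回 True
# ifInstert('示例股份', '91XXXXXXXXXXXXXXXX', '2023年年度报告', '2024-03-01 00:00:00')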
def sendKafka(social_code,newsUrl,dic_news):
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportNoticeTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, newsUrl, 'Kafka操作失败')
log.info(dic_result)
return False
def GetContent(pdf_url,info_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
# 上传至华为云服务器
retData = uptoOBS(pdf_url, pdf_name, 8, social_code)
# 附件插入att数据库
if retData['state']:
pass
else:
log.info(f'====pdf解析失败====')
# 获取当前进程pid
current_pid = baseCore.getPID()
# todo: 重新启动新进程,杀死当前进程
subprocess.Popen([sys.executable] + sys.argv)
os.kill(current_pid, 9)
return False
num = num + 1
att_id = tableUpdate(retData, com_name, year, pdf_name, num)
if att_id:
pass
else:
return False
content = retData['content']
lang = baseCore.detect_language(content)
if lang == 'cn':
lang = 'zh'
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
dic_news = {
'attachmentIds': att_id,
'author': '',
'content': content,
'contentWithTag': '',
'createDate': time_now,
'deleteFlag': '0',
'id': '',
'keyWords': '',
'lang': lang,
'origin': '东方财富网',
'publishDate': pub_time,
'sid': '1684032033495392257',
'sourceAddress': info_url, # 原文链接
'summary': '',
'title': pdf_name.replace('.pdf', ''),
'type': 3,
'socialCreditCode': social_code,
'year': year
}
# print(dic_news)
# 将相应字段通过kafka传输保存
try:
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
kafka_result = producer.send("researchReportNoticeTopic",
json.dumps(dic_news, ensure_ascii=False).encode('utf8'))
print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
'message': '操作成功',
'code': '200',
}
log.info(dic_result)
return True
except Exception as e:
dic_result = {
'success': 'false',
'message': '操作失败',
'code': '204',
'e': e
}
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
log.info(dic_result)
return False
def gonggao_info(dic_info):
code = dic_info[3]
com_name = dic_info[1]
social_code = dic_info[2]
if 'HK' in code:
pass
else:
return
#https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index=1&ann_type=H&client_source=web&stock_list=00175&f_node=0
url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index=1&ann_type=H&client_source=web&stock_list={code.split(".HK")[0]}&f_node=0'
for n1 in range(0, 3):
try:
res = requests.get(url, verify=False)
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
res_json = res.json()
total_hits = res_json['data']['total_hits']
total_pages = (total_hits + 49) // 50
for page1 in range(1, total_pages + 1):
url = f'https://np-anotice-stock.eastmoney.com/api/security/ann?sr=-1&page_size=50&page_index={page1}&ann_type=H&client_source=web&stock_list={code.split(".HK")[0]}&f_node=0'
for n1 in range(0, 3):
try:
res = requests.get(url, verify=False)
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(5)
continue
res_json = res.json()
list_all = res_json['data']['list']
if list_all:
for one_info in list_all:
title = one_info['title']
info_date = one_info['notice_date']
year = info_date[:4]
# if page1 > 1 and '2022' in info_date:
# break_id = 1
# break
# if '2021' in info_date: # 只采集22年以后的数据
# break_id = 1
# break
try:
info_type = one_info['columns'][0]['column_name']
except:
info_type = ''
art_code = one_info['art_code']
info_url = 'https://data.eastmoney.com/notices/detail/' + code + '/' + art_code + '.html'
t = int(time.time() * 1000)
# https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code=AN202308221595478274&client_source=web&page_index=1&_=1708918810986
json_url = f'https://np-cnotice-stock.eastmoney.com/api/content/ann?art_code={art_code}&client_source=web&page_index=1&_={t}'
for n1 in range(0, 3):
try:
ip = baseCore.get_proxy()
json_2 = requests.get(json_url, proxies=ip,verify=False).json()
break
except:
if n1 == 2:
sys.exit(0)
time.sleep(60)
continue
try:
pdf_url = json_2['data']['attach_url']
except:
pdf_url = ''
try:
info_content = json_2['data']['notice_content']
except:
info_content = ''
ifexist = ifInstert(com_name, social_code, title, info_date)
# ifexist = True
if ifexist:
# 解析PDF内容,先获取PDF链接 下载 解析成功,解析失败 ,传输成功,传输失败
result = GetContent(pdf_url, info_url,title, social_code, year, info_date, start_time, com_name, num)
if result:
# 公告信息列表
log.info(f'{com_name}==============解析传输操作成功')
state = 1
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')
# 发送kafka成功之后 再插入数据库
insert = InsterInto(social_code, info_url, info_date, title)
if insert:
log.info(f'===={social_code}========{title}=====插入库成功')
pass
else:
continue
else:
log.info(f'======={com_name}========{code}===已存在')
continue
if __name__ =='__main__':
# 从redis中读取social_code
list_c = []
list_all_info_1 = []
num = 0
taskType = '企业公告/东方财富网'
while True:
start_time = time.time()
# 获取企业信息
# social_code = baseCore.redicPullData('NoticeEnterprise:ggqy_socialCode_add')
social_code = '91330000747735638J'
if not social_code:
time.sleep(20)
continue
if social_code == 'None':
time.sleep(20)
continue
if social_code == '':
time.sleep(20)
continue
dic_info = baseCore.getInfomation(social_code)
# count = dic_info[15]
code = dic_info[3]
com_name = dic_info[1]
log.info(f'-----开始处理{com_name}----{social_code}------')
try:
gonggao_info(dic_info)
except:
log.info(f'-----error:{com_name}----{social_code}------')
break
import os
......@@ -48,6 +48,7 @@ def getuuid():
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(pdf_url,pdf_name,type_id,social_code):
headers = {}
category = os.path.splitext(pdf_url)[1]
......@@ -56,15 +57,40 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
'create_time': '', 'page_size': '', 'content': ''}
headers['User-Agent'] = baseCore.getRandomUserAgent()
if category == '.pdf':
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
# todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
pass
except:
log.error(f'文件损坏')
return retData
else:
for i in range(0, 3):
try:
ip = baseCore.get_proxy()
response = requests.get(pdf_url, headers=headers,verify=False,proxies=ip, timeout=20)
page_size = 1
response = requests.get(pdf_url, headers=headers,verify=False, timeout=20)
if response.status_code != 200:
return retData
file_size = int(response.headers.get('Content-Length'))
retData['content'] = response.text
#todo:判断内容是否成功
if '<div class="K">403</div>' in retData['content'] or 'Error Times: ' in retData['content']:
return retData
else:
break
except Exception as e:
time.sleep(60)
except:
time.sleep(3)
continue
name = str(getuuid()) + category
......@@ -73,12 +99,6 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except:
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
except:
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
......@@ -95,11 +115,12 @@ def uptoOBS(pdf_url,pdf_name,type_id,social_code):
except Exception as e:
state = 0
takeTime = baseCore.getTimeCost(start_time, time.time())
baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
#baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
......
......@@ -56,7 +56,7 @@ if __name__=="__main__":
url = "https://mp.weixin.qq.com/"
browser.get(url)
# 可改动
time.sleep(20)
time.sleep(80)
s = requests.session()
#获取到token和cookies
......
联合国:https://www.un-ilibrary.org/content/papers/27082822
世界经贸组织
https://docs.wto.org/dol2fe/Pages/FE_Search/FE_S_S006.aspx?Language=English&SourcePage=FE_B_009&Context=Script&DataSource=Cat&Query=(%40Symbol%3d%22WT%2fLET*%22+AND+(%40Title%3d(modifications+OR+rectifications)+AND+schedule))&languageUIChanged=true
经合组织
https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en
国际化经营-欧盟
https://ec.europa.eu/eurostat/databrowser/explore/all/tb_eu?lang=en&display=list&sort=category
\ No newline at end of file
......@@ -272,34 +272,6 @@ class BaseCore:
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.pool_caiji = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.115.159.144',
port=3306,
user='caiji',
password='zzsn9988',
database='caiji',
charset='utf8mb4'
)
self.pool_11 = PooledDB(
creator=pymysql,
maxconnections=5,
mincached=2,
maxcached=5,
blocking=True,
host='114.116.44.11',
port=3306,
user='caiji',
password='f7s0&7qqtK',
database='clb_project',
charset='utf8mb4'
)
def check_mysql_conn(self,conn):
try:
conn.ping()
......@@ -461,16 +433,6 @@ class BaseCore:
panduan = ppp
return panduan
# # 从Redis的List中获取并移除一个元素
# def redicPullData(self,type,key):
# #1 表示国内 2 表示国外
# if type == 1:
# gn_item = self.r.lpop(key)
# return gn_item.decode() if gn_item else None
# if type == 2:
# gw_item = self.r.lpop(key)
# return gw_item.decode() if gw_item else None
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
......@@ -496,460 +458,3 @@ class BaseCore:
os.makedirs(path) # makedirs 创建文件时如果路径不存在会创建这个路径
else:
pass
# 生成google模拟浏览器 必须传入值为googledriver位置信息
# headless用于决定是否为无头浏览器,初始默认为无头浏览器
# 正常浏览器可用于开始对页面解析使用或一些网站无头时无法正常采集
# 无头浏览器用于后续对信息采集时不会有浏览器一直弹出,
def buildDriver(self, path, headless=True):
service = Service(path)
chrome_options = webdriver.ChromeOptions()
if headless:
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_experimental_option(
"excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
chrome_options.add_argument('user-agent=' + self.getRandomUserAgent())
# 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
driver = webdriver.Chrome(options=chrome_options, service=service)
# with open(r'F:\zzsn\zzsn_spider\base\stealth.min.js') as f:
# js = f.read()
#
# driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": js
# })
return driver
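    # Usage sketch (the driver path below is only an illustrative assumption, not taken from this repo):
    #   driver = baseCore.buildDriver(r'D:\soft\chromedriver.exe', headless=True)
    #   driver.get('https://www.example.com')
    #   html = driver.page_source
    #   driver.quit()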
# 根据社会信用代码获取企业信息
def getInfomation(self, social_code):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 根据企业名称获取企业信息
def getBYnameInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE CompanyName = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 根据企业名称获取企业信息
def getBYtycidInfomation(self, com_name):
data = []
try:
sql = f"SELECT * FROM EnterpriseInfo WHERE TYCID = '{com_name}'"
# self.cursor.execute(sql)
# data = self.cursor.fetchone()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql)
data = cursor.fetchone()
conn.commit()
data = list(data)
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('=========数据库操作失败========')
return data
# 更新企业采集次数
def updateRun(self, social_code, runType, count):
try:
sql_update = f"UPDATE EnterpriseInfo SET {runType} = {count} WHERE SocialCode = '{social_code}'"
# self.cursor.execute(sql_update)
# self.cnx.commit()
conn = self.pool_caiji.connection()
cursor = conn.cursor()
cursor.execute(sql_update)
conn.commit()
cursor.close()
conn.close()
except:
log = self.getLogger()
log.info('======更新数据库失败======')
# 保存日志入库
def recordLog(self, xydm, taskType, state, takeTime, url, e):
try:
createTime = self.getNowTime(1)
ip = self.getIP()
pid = self.getPID()
sql = "INSERT INTO LogTable(SocialCode,TaskType,state,TakeTime,url,CreateTime,ProcessIp,PID,Exception) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
values = [xydm, taskType, state, takeTime, url, createTime, ip, pid, e]
# try:
# self.cursor.execute(sql, values)
# except Exception as e:
# print(e)
# self.cnx.commit()
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql, values)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存日志失败=====')
# 获取企查查token
def GetToken(self):
# 获取企查查token
query = "select token from QCC_token "
# token = '67ec7402166df1da84ae83c4b95cefc0' # 需要隔两个小时左右抓包修改
self.cursor.execute(query)
token_list = self.cursor.fetchall()
self.cnx.commit()
try:
token = token_list[random.randint(0, len(token_list) - 1)][0]
except:
token = ''
return token
# 删除失效的token
def delete_token(self, token):
deletesql = f"delete from QCC_token where token='{token}' "
self.cursor.execute(deletesql)
self.cnx.commit()
# 获取天眼查token
def GetTYCToken(self):
query = 'select token from TYC_token'
self.cursor.execute(query)
token = self.cursor.fetchone()[0]
self.cnx.commit()
return token
# 检测语言
def detect_language(self, text):
# 使用langid.py判断文本的语言
result = langid.classify(text)
if result == '':
return 'cn'
if result[0] == '':
return 'cn'
if result[0] == 'ja':
return 'jp'
if result[0] == 'fr':
return 'fra'
if result[0] == 'es':
return 'spa'
if result[0] == 'fi':
return 'fin'
if result[0] == 'vi':
return 'vie'
if result[0] == 'ko':
return 'kor'
if result[0] == 'da':
return 'dan'
return result[0]
#创建excel文件
def check_excel_file(self,file_path):
if os.path.isfile(file_path):
self.getLogger().info("Excel文件已存在")
return True
else:
self.getLogger().info("Excel文件不存在,正在创建...")
return False
# 追加接入excel
def writerToExcel(self, detailList, filename):
# filename='baidu搜索.xlsx'
# 读取已存在的xlsx文件
existing_data = pd.read_excel(filename, engine='openpyxl', dtype=str)
# 创建新的数据
new_data = pd.DataFrame(data=detailList)
        # 将新数据添加到现有数据的末尾(DataFrame.append 在 pandas 2.x 已移除,改用 concat)
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# 将结果写入到xlsx文件
combined_data.to_excel(filename, index=False)
# return combined_data
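    # Usage sketch for check_excel_file + writerToExcel (the file name and columns below are
    # illustrative assumptions): create the workbook with headers on first use, then append rows.
    #   if not baseCore.check_excel_file('采集结果.xlsx'):
    #       pd.DataFrame(columns=['social_code', 'name']).to_excel('采集结果.xlsx', index=False)
    #   baseCore.writerToExcel([{'social_code': 'DEMO123456', 'name': '示例企业'}], '采集结果.xlsx')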
# 对失败或者断掉的企业 重新放入redis
def rePutIntoR(self, key, item):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.r.rpush(key, item)
# 增加计数器的值并返回增加后的值
def incrSet(self, key):
# 增加计数器的值并返回增加后的值
new_value = self.r.incr(key)
print("增加后的值:", new_value)
return new_value
# 获取key剩余的过期时间
def getttl(self, key):
# 获取key的剩余过期时间
ttl = self.r.ttl(key)
print("剩余过期时间:", ttl)
# 判断key是否已过期
if ttl < 0:
# key已过期,将key的值重置为0
self.r.set(key, 0)
self.r.expire(key, 3600)
time.sleep(2)
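    # Usage sketch: the two helpers above can act as a simple hourly rate limiter
    # (the key name and the limit of 100 are illustrative assumptions, not from this repo):
    #   baseCore.getttl('RequestCount')          # reset the counter to 0 with a 3600s TTL once it has expired
    #   if baseCore.incrSet('RequestCount') > 100:
    #       time.sleep(60)                       # back off after more than 100 calls in the current hour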
# # 上传至文件服务器,并解析pdf的内容和页数
# def upLoadToServe(self, pdf_url, type_id, social_code):
# headers = {}
# retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
# 'full_path': '',
# 'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
# 'create_time': '', 'page_size': '', 'content': ''}
# headers['User-Agent'] = self.getRandomUserAgent()
# for i in range(0, 3):
# try:
# resp_content = requests.get(pdf_url, headers=headers, verify=False, timeout=20).content
# break
# except:
# time.sleep(3)
# continue
# page_size = 0
#
# for i in range(0, 3):
# try:
# result = client.upload_by_buffer(resp_content, file_ext_name='pdf')
# with fitz.open(stream=resp_content, filetype='pdf') as doc:
# page_size = doc.page_count
# for page in doc.pages():
# retData['content'] += page.get_text()
# break
# except:
# time.sleep(3)
# continue
# if page_size < 1:
# # pdf解析失败
# print(f'======pdf解析失败=====')
# return retData
# else:
# time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
# retData['state'] = True
# retData['path'] = bytes.decode(result['Remote file_id']).replace('group1', '')
# retData['full_path'] = bytes.decode(result['Remote file_id'])
# retData['file_size'] = result['Uploaded size']
# retData['create_time'] = time_now
# retData['page_size'] = page_size
#
# return retData
def deliteATT(self,id):
delitesql = f"delete from clb_sys_attachment where id = '{id}' "
self.cursor_.execute(delitesql)
self.cnx_.commit()
def secrchATT(self, item_id, year, type_id):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and year = %s and type_id=%s '''
self.cursor_.execute(sel_sql, (item_id, year, type_id))
selects = self.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(self, retData, com_name, year, pdf_name, num, pub_time,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
selects = self.secrchATT(item_id, year, type_id)
if selects:
self.getLogger().info(f'com_name:{com_name}--{year}已存在')
id = ''
return id
else:
Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
year, pdf_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, page_size, full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1], 'zzsn',
pub_time,origin)
self.cursor_.execute(Upsql, values) # 插入
self.cnx_.commit() # 提交
self.getLogger().info("更新完成:{}".format(Upsql))
selects = self.secrchATT(item_id, year, type_id)
id = selects[0]
return id
# 更新企业的CIK
def updateCIK(self, social_code, cik):
try:
sql = f"UPDATE EnterpriseInfo SET CIK = '{cik}' WHERE SocialCode = '{social_code}'"
cnn = self.pool_caiji.connection()
cursor = cnn.cursor()
cursor.execute(sql)
cnn.commit()
cursor.close()
cnn.close()
except:
log = self.getLogger()
log.info('======保存企业CIK失败=====')
    # 上传至obs华为云服务器,并解析pdf的内容和页数
# 获取文件大小
def convert_size(self, size_bytes):
# 定义不同单位的转换值
units = ['bytes', 'KB', 'MB', 'GB', 'TB']
i = 0
while size_bytes >= 1024 and i < len(units) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {units[i]}"
# 查看obs文件是否已经上传
def obsexist(self, file_path):
# # 文件路径
# file_path = 'XQWAnnualReport/2023-10/浙江国祥股份有限公司首次公开发行股票并在主板上市暂缓发行公告.doc'
# 检查文件是否存在
response = obsClient.getObjectMetadata('zzsn', file_path)
if response.status >= 300:
self.getLogger().info('=====文件不存在obs=====')
return True
else:
self.getLogger().info(f'=====文件存在obs========{file_path}')
#uuid 根据时间戳生成 文件名 上传到obs
def getuuid(self):
get_timestamp_uuid = uuid.uuid1() # 根据 时间戳生成 uuid , 保证全球唯一
return get_timestamp_uuid
def uptoOBS(self, pdf_url, name_pdf, type_id, social_code, pathType, taskType, start_time,create_by,headers):
retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
'full_path': '',
'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': create_by,
'create_time': '', 'page_size': '', 'content': ''}
for i in range(0, 3):
try:
response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
file_size = int(response.headers.get('Content-Length'))
break
except:
time.sleep(3)
continue
page_size = 0
name = str(self.getuuid()) + '.pdf'
now_time = time.strftime("%Y-%m")
try:
result = self.getOBSres(pathType, now_time, name, response)
except:
log = self.getLogger()
log.error(f'OBS发送失败')
return retData
try:
with fitz.open(stream=response.content, filetype='pdf') as doc:
page_size = doc.page_count
for page in doc.pages():
retData['content'] += page.get_text()
except:
log = self.getLogger()
log.error(f'文件损坏')
return retData
if page_size < 1:
# pdf解析失败
# print(f'======pdf解析失败=====')
return retData
else:
try:
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
retData['state'] = True
retData['path'] = result['body']['objectUrl'].split('.com')[1]
retData['full_path'] = result['body']['objectUrl']
retData['file_size'] = self.convert_size(file_size)
retData['create_time'] = time_now
retData['page_size'] = page_size
except Exception as e:
state = 0
takeTime = self.getTimeCost(start_time, time.time())
self.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
return retData
return retData
@retry(tries=3, delay=1)
def getOBSres(self, pathType, now_time, name, response):
result = obsClient.putContent('zzsn', pathType + name, content=response.content)
# resp = obsClient.putFile('zzsn', pathType + name, file_path='要上传的那个文件的本地路径')
return result
def sendEmail(self, file_name):
file = open(file_name, 'rb').read()
# 发送邮箱地址
sender = '1195236739@qq.com'
# 接收邮箱地址
receiver = 'fujunxue@ciglobal.cn'
smtpserver = 'smtp.qq.com'
# 发送邮箱登录 账户 密码
username = '1195236739@qq.com'
password = 'gatvszshadvpgjci'
maile_title = '企业基本信息采集情况'
message = MIMEMultipart()
message['From'] = sender
message['To'] = receiver
message['Subject'] = Header(maile_title, 'utf-8')
message.attach(MIMEText('企业基本信息采集情况', 'plain', 'utf-8'))
xlsxApart = MIMEApplication(file)
xlsxApart.add_header('Content-Disposition', 'attachment', filename='企业基本信息采集情况.xlsx')
message.attach(xlsxApart)
smtpObj = smtplib.SMTP_SSL(smtpserver) # 注意:如果遇到发送失败的情况(提示远程主机拒接连接),这里要使用SMTP_SSL方法
smtpObj.connect(smtpserver, port=465)
smtpObj.login(username, password)
smtpObj.sendmail(sender, receiver, message.as_string())
print("邮件发送成功!!!")
smtpObj.quit()
"""
国外智库-欧盟 经合组织
"""
import json
import time
import pymongo
from bs4 import BeautifulSoup
import requests
from datetime import datetime
from kafka import KafkaProducer
from retry import retry
import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').ZZSN[
'国外智库']
@retry(tries=2, delay=5)
def sendKafka(dic):
producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
kafka_result = producer.send("research_center_fourth",
json.dumps(dic, ensure_ascii=False).encode('utf8'))
log.info(f'{dic["sourceAddress"]}传输成功')
def secrchATT(item_id, retData, type_id, order_by):
sel_sql = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
baseCore.cursor_.execute(sel_sql, (item_id, retData['path'], type_id, order_by))
selects = baseCore.cursor_.fetchone()
return selects
# 插入到att表 返回附件id
def tableUpdate(retData, file_name, num, publishDate,origin):
item_id = retData['item_id']
type_id = retData['type_id']
group_name = retData['group_name']
path = retData['path']
full_path = retData['full_path']
category = retData['category']
file_size = retData['file_size']
status = retData['status']
create_by = retData['create_by']
page_size = retData['page_size']
create_time = retData['create_time']
order_by = num
object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
values = (
file_name+'.pdf', type_id, item_id, group_name, path, full_path, category, file_size, order_by,
status, create_by,
create_time, object_key, 'zzsn', publishDate,origin)
baseCore.cursor_.execute(Upsql, values) # 插入
baseCore.cnx_.commit() # 提交
baseCore.getLogger().info("更新完成:{}".format(Upsql))
selects = secrchATT(item_id, retData, type_id, order_by)
id = selects[0]
return id
def save_data(dic_news):
aaa_dic = {
'附件id': dic_news['attachmentIds'],
'网址': dic_news['sourceAddress'],
'tid': '',
'来源': f"经济合作与发展组织",
'创建时间': dic_news['createDate'],
'带标签内容': dic_news['contentWithTag'][:100],
'发布时间': dic_news['publishDate'],
'标题': dic_news['title']
}
db_storage.insert_one(aaa_dic)
@retry(tries=2, delay=5)
def translate(title, contentWithTag):
headers = {
'Content-Type': 'application/json',
}
dic_info = {
'title': title,
# 'summary': '<div>apple</div>',
'contentWithTag': contentWithTag
}
dic_info = json.dumps(dic_info)
req = requests.post('http://117.78.23.14:5001/translate', data=dic_info, headers=headers)
dataJson = req.json()
    if dataJson['status'] == 'failed':
        raise Exception('翻译接口返回failed')
titleRaw = dataJson['title']
contentWithTagRaw = dataJson['contentWithTag']
titleRaw = BeautifulSoup(titleRaw,'html.parser')
titleRaw = titleRaw.text
contentWithTagRaw = BeautifulSoup(contentWithTagRaw,'html.parser')
return titleRaw, contentWithTagRaw
def doJob():
num = 1
url = 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=1'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Cookie': 'JSESSIONID=BHezogPwi8NJVECsKXCXqijdQ00-yMJHw_gR8wiC.ip-10-240-5-121; __cf_bm=c2byUypnSjXPS_UFDM7BMRGDxN6AQEkNVUjzw9HuSq8-1707054653-1-AbbI7JWWkfWKVGi8SKI06f0jGEjPdk5kvHAIRRpBHSSSnmxj1IcvGUT8+/O6R0U2RLZJECZdUzZIXAwFuEz5lPo=; _gcl_au=1.1.201344533.1707054655; _gid=GA1.2.557164000.1707054655; cb-enabled=enabled; cf_clearance=6tK6.WKHJbXXoV4NTgbyHRhetRxMdWPZofwlv01F65Y-1707054656-1-AfrYlWnLLZFC1sKxeFVQintPrZnjvjoJSZwRRhAYwqRHGdWbU5IFZQDJZJM21l20Tj6gk4JxNobWT0wGzp1Dgjw=; _ce.irv=new; cebs=1; _ce.clock_event=1; _ce.clock_data=72%2C123.149.3.159%2C1%2C9c1ce27f08b16479d2e17743062b28ed; custom_cookie_AB=1; AWSALB=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; AWSALBCORS=I/eGQ0glcxuROskD1JKEl/dqsqElpmo/MnwLboJZJB2QthQFFWnLA3gzuJTskEaZxJD7VuWEEsqjhLVvhq4q2Wt0RebuRhukeHpKvgmGMelxpn/RiDmehyvxTOiS; _gat_UA-1887794-2=1; _dc_gtm_UA-136634323-1=1; _ga_F5XZ540Q4V=GS1.1.1707054655.1.1.1707055119.7.0.0; _ga=GA1.1.1014316406.1707054655; _ga_F7KSNTXTRX=GS1.1.1707054655.1.1.1707055119.0.0.0; cebsp_=5; _ce.s=v~212f033193b9432855ae8335d6d3969cc1f8b751~lcw~1707055134688~lva~1707054658247~vpv~0~v11.fhb~1707054659602~v11.lhb~1707055126493~v11.cs~325107~v11.s~6d7ba630-c364-11ee-aba8-136dbbf9a447~v11.sla~1707055134688~v11.send~1707055135439~lcw~1707055135439',
'Referer': 'https://www.oecd-ilibrary.org/economics/oecd-policy-responses-on-the-impacts-of-the-war-in-ukraine_dc825602-en?page=2',
'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"Windows"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser')
div_part = soup.find_all('div', class_='col-xs-12 body-section')[1]
div_list = div_part.find_all('div', class_='row panel')
for div in div_list:
start_time = time.time()
title = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title').text
href = 'https://www.oecd-ilibrary.org' + div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('a')['href']
is_href = db_storage.find_one({'网址': href})
if is_href:
log.info(f'{href}===已采集')
continue
pubtime_ = div.find('div', class_='col-lg-7 col-xs-12 resume-item').find('p', class_='intro-item').find('strong', class_='book-title gray').text
# 定义原始时间的格式
time_format = "%d %b %Y"
# 转换为标准时间
standard_time = datetime.strptime(pubtime_, time_format).strftime("%Y-%m-%d")
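        # e.g. pubtime_ '30 Jan 2023' -> standard_time '2023-01-30'; only items published after 2023-01-30 are kept below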
if standard_time > '2023-01-30':
pass
else:
break
year = standard_time[:4]
pdf_part = div.find('div', class_='col-lg-5 col-xs-12 actions-item').find('ul', class_='actions').find_all('li')[1].find('a').get('href')
pdf_url = 'https://www.oecd-ilibrary.org' + pdf_part
req_news = requests.get(url=href, headers=headers)
soup_news = BeautifulSoup(req_news.content, 'html.parser')
# print(title, standard_time, pdf_url, href)
contentWithTag = soup_news.find('div', class_='description js-desc-fade show-all')
content = contentWithTag.get_text()
# todo:翻译
try:
titleRaw, contentWithTagRaw = translate(str(title), str(contentWithTag))
log.info(f'{href}===翻译成功')
except Exception as e:
log.error(f'{href}===翻译失败==={e}')
continue
        retData = baseCore.uptoOBS(pdf_url, title, 15, '', pathType, taskType, start_time, create_by, headers)
num += 1
id_list = []
if retData['state']:
att_id = tableUpdate(retData, title, num, standard_time, '经济合作与发展组织')
if att_id:
id_list.append(att_id)
now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
lang = baseCore.detect_language(content)
contentRaw = contentWithTagRaw.text
contentWithTagRaw = str(contentWithTagRaw)
dic = {
'id': f'1620244462491893761{int(time.time())}',
'subjectId': '1620244462491893761',
'checkStatus': 1,
'deleteFlag': 0,
'topNum': 0,
'content': content,
'contentRaw': contentRaw,
'contentWithTag': str(contentWithTag),
'contentWithTagRaw': contentWithTagRaw,
'createDate': now,
'labels': [
{'labelMark': 'organization', 'relationId': '1619903523269271554', 'relationName': '经济合作与发展组织'}],
'lang': lang,
'origin': '经济合作与发展组织',
'publishDate': standard_time,
'sourceAddress': href,
'title': title,
'titleRaw': titleRaw,
'updateDate': now,
'attachmentIds':id_list
}
sendKafka(dic)
try:
save_data(dic)
except:
log.error(f'{href}===数据库保存失败')
# break
if __name__ == "__main__":
pathType = 'PolicyDocuments/'
taskType = '国外智库-经合组织'
create_by = 'XueLingKun'
doJob()
......@@ -119,17 +119,17 @@ if __name__=='__main__':
# or '中共' in author or '记者' in author or '新闻社' in author\
# or '党委' in author or '调研组' in author or '研究中心' in author\
# or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
# if '(' in author or '本刊' in author \
# or '记者' in author or '新闻社' in author \
# or '”' in author\
# or '大学' in author or '洛桑江村' in author:
# continue
if '国资委党委' in author:
pass
else:
if '(' in author or '本刊' in author \
or '记者' in author or '新闻社' in author \
or '”' in author\
or '大学' in author or '洛桑江村' in author:
continue
# if '国资委党委' in author:
# pass
# else:
# continue
new_href = new.find('a')['href']
is_member = r.sismember('qiushileaderspeech_two::' + period_title, new_href)
is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
if is_member:
continue
new_title = new.find('a').text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
......@@ -165,7 +165,7 @@ if __name__=='__main__':
}
log.info(dic_news)
if sendKafka(dic_news):
r.sadd('qiushileaderspeech_two::' + period_title, new_href)
r.sadd('qiushileaderspeech::' + period_title, new_href)
log.info(f'采集成功----{dic_news["sourceAddress"]}')
import csv
import time
import pandas as pd
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.common import StaleElementReferenceException
from base import BaseCore
from requests.packages import urllib3
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urllib3.disable_warnings()
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
......@@ -33,54 +40,100 @@ headers = {
'sec-ch-ua-platform': '"Windows"',
}
# todo:使用模拟浏览器
def create_driver():
path = r'D:\soft\msedgedriver.exe'
    # options = webdriver.EdgeOptions()
    options = {
        "browserName": "MicrosoftEdge",
        "ms:edgeOptions": {
            "extensions": [], "args": ["--start-maximized"]  # 添加最大化窗口运作参数
        }
    }
    driver = webdriver.Edge(executable_path=path, capabilities=options)
    return driver
@retry(tries=2, delay=5)
def getHref(Keywords, driver):
# data = {
# 'Menu': 'law',
# 'Keywords': Keywords,
# 'PreKeywords': Keywords,
# 'SearchKeywordType': 'Title',
# 'MatchType': 'Exact',
# 'RangeType': 'Piece',
# 'Library': 'chl',
# 'ClassFlag': 'chl',
# 'GroupLibraries': '',
# 'QuerySearchCondition': 'Title+Exact+Piece+0',
# 'QueryOnClick': False,
# 'AfterSearch': True,
# 'RequestFrom': 'btnSearch',
# 'SearchInResult': '',
# 'PreviousLib': 'chl',
# 'IsSynonymSearch': 'false',
# 'RecordShowType': 'List',
# 'ClassCodeKey': ',,,,,,',
# 'IsSearchErrorKeyword': '',
# 'FirstQueryKeywords': Keywords,
# 'FirstQueryKeywordType': 'Title',
# 'IsSynonymSearch': 'false',
# 'X-Requested-With': 'XMLHttpRequest',
# }
driver.get('https://sclx.pkulaw.com/law')
# ip = baseCore.get_proxy()
driver.find_element(By.ID, 'txtSearch').send_keys(Keywords)
time.sleep(0.5)
driver.find_element(By.CLASS_NAME, 'btn-search').click()
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "accompanying-wrap")))
getpart = driver.find_element(By.CLASS_NAME, 'accompanying-wrap')
# li_list = getpart.find_elements(By.TAG_NAME, 'li')
# for li in li_list:
driver.execute_script("arguments[0].scrollIntoView();", getpart)
time.sleep(2)
try:
element = getpart.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
time.sleep(1)
driver.execute_script("arguments[0].scrollIntoView();", element)
time.sleep(1)
element.click()
href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "a-tab-col")))
info_part = driver.find_element(By.CLASS_NAME, 'a-tab-col').find_element(By.XPATH, './/div[@name="HistoryAssociation"]')
# except Exception as e:
except StaleElementReferenceException:
# 元素已经stale,重新定位元素
element = driver.find_element(By.XPATH, ".//div/div[1]/div[3]/div/div[1]/ul/li[@name='HistoryAssociation']")
element.click() # 再次尝试与元素交互
href = 'https://sclx.pkulaw.com' + element.get_attribute("url")
# log.info(e)
# href = ''
return href
# url = 'https://sclx.pkulaw.com/law/chl'
# req = requests.post(url, headers=headers, data=data, proxies=ip)
# req = requests.post(url, headers=headers, data=data, verify=False)
# req.encoding = req.apparent_encoding
# soup = BeautifulSoup(req.text, 'html.parser')
# try:
# tag = soup.find('div', class_='accompanying-wrap').find('div', class_='item').find('li', attrs={
# 'name': 'HistoryAssociation'})
# href = 'https://sclx.pkulaw.com' + tag.get('url')
# except:
# href = ''
# return href
@retry(tries=3, delay=5)
def getData(href, Keywords):
term = Keywords
# ip = baseCore.get_proxy()
# req = requests.get(href, headers=headers, proxies=ip)
req = requests.get(href, headers=headers)
req.encoding = req.apparent_encoding
soup = BeautifulSoup(req.text, 'html.parser')
li_list = soup.find_all('li')
......@@ -90,22 +143,59 @@ def getData(href):
theme = li.find('div', class_='theme').text.strip()
except:
theme = ''
try:
relevance = li.find('div', class_='relevance').text.strip()
except:
relevance = ''
data.append([publishDate,theme,relevance])
time.sleep(1)
return data
# try:
# relevance = li.find('div', class_='relevance').text.strip()
# except:
# relevance = ''
# log.info(f'{publishDate}==={theme}==')
term += ',' + theme + '_' + publishDate
log.info(term)
if ',' not in term or '_' not in term:
r.rpush('ShenjisclxError:', Keywords)
return None
return term
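# term is a flat CSV-style string: '<keyword>,<theme1>_<publishDate1>,<theme2>_<publishDate2>,...',
# which doJob() below pushes into the 'ShenjisclxReault:' Redis list.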
def doJob():
    # Keywords = '中华人民共和国公司法(2023修订)'
data_list = []
driver = create_driver()
driver.maximize_window()
while True:
try:
Keywords = r.lpop('Shenjisclx:').decode()
# Keywords = '中华人民共和国银行业监督管理法(2006修正)'
except:
Keywords = ''
if Keywords:
try:
href = getHref(Keywords, driver)
if href:
                    r.rpush('ShenjisclxHref:', f'{Keywords}|{href}')
                    log.info(f'{Keywords}====找到=== {href}')
                    term = getData(href, Keywords)
else:
term = Keywords + ','
r.rpush('ShenjisclxHrefNull:', f'{Keywords}|{href}')
log.info(f'{Keywords}====未找到')
if term:
# data_list.append(term)
r.rpush('ShenjisclxReault:', term)
except:
r.rpush('ShenjisclxError:', Keywords)
continue
time.sleep(2)
else:
break
# print(data_list)
# with open('./output.csv', 'w', newline='') as file:
# writer = csv.writer(file)
#
# # 写入数据
# for row in data_list:
# writer.writerow(row.split(','))
#
# print('数据已成功写入CSV文件')
if __name__ == '__main__':
doJob()
......
import csv
# 要写入的数据
# data = [
# ['Name', 'Age', 'City'],
# ['Alice', 25, 'New York'],
# ['Bob', 30, 'Los Angeles'],
# ['Charlie', 35, 'Chicago']
# ]
data = ['aaaa,bbbb,cccc', 'aaaa,cccc,ffff']
# 打开CSV文件进行写入
with open('./output.csv', 'w', newline='') as file:
writer = csv.writer(file)
# 写入数据
for row in data:
writer.writerow(row.split(','))
print('数据已成功写入CSV文件')
import csv
import redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
if __name__ == "__main__":
with open('./output0220_1.csv', 'w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
while True:
try:
term_ = r.lpop('ShenjisclxError:').decode()
term = str(term_) + ','
except:
term = ''
if term == '':
break
else:
# 写入数据
writer.writerow(str(term).split(','))
......@@ -170,5 +170,71 @@ for data in datas:
# f.write(dic_info_)
# break
# req = requests.post('http://192.168.1.236:5000/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5001/translate',data=dic_info_,headers=headers)
req = requests.post('http://117.78.23.14:5000/translate',data=dic_info_,headers=headers)
log.info(req.text)
# import re, datetime
#
#
# def paserTime(publishtime):
# timeType = ['年前', '月前', '周前', '前天', '昨天', '天前', '今天', '小时前', '分钟前']
# current_datetime = datetime.datetime.now()
# publishtime = publishtime.strip()
# print(publishtime)
#
# try:
# if '年前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=365 * day)
# publishtime = current_datetime - delta
# elif '月前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(months=day)
# publishtime = current_datetime - delta
# elif '周前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(weeks=day)
# publishtime = current_datetime - delta
# elif '天前' in publishtime:
# numbers = re.findall(r'\d+', publishtime)
# day = int(numbers[0])
# delta = datetime.timedelta(days=day)
# publishtime = current_datetime - delta
# elif '前天' in publishtime:
# delta = datetime.timedelta(days=2)
# publishtime = current_datetime - delta
# elif '昨天' in publishtime:
# current_datetime = datetime.datetime.now()
# delta = datetime.timedelta(days=1)
# publishtime = current_datetime - delta
# elif '今天' in publishtime or '小时前' in publishtime or '分钟前' in publishtime:
# if '小时' in publishtime:
# hour = publishtime.split("小时")[0]
# else:
# hour = 0
# if hour != 0:
# min = publishtime.split("小时")[1].split("分钟")[0]
# else:
# min = publishtime.split("分钟")[0]
#
# delta = datetime.timedelta(hours=int(hour), minutes=int(min))
# publishtime = current_datetime - delta
# elif '年' in publishtime and '月' in publishtime:
# time_format = '%Y年%m月%d日'
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# elif '月' in publishtime and '日' in publishtime:
# current_year = current_datetime.year
# time_format = '%Y年%m月%d日'
# publishtime = str(current_year) + '年' + publishtime
# publishtime = datetime.datetime.strptime(publishtime, time_format)
# except Exception as e:
# print('时间解析异常!!')
# return publishtime
#
# if __name__ == "__main__":
# publishtime_ = '1小时17分钟前'
# publish_time = paserTime(publishtime_).strftime("%Y-%m-%d")
# print(publish_time)
\ No newline at end of file
# -*- coding: utf-8 -*-
......@@ -59,12 +59,13 @@ def newsdata(art_content_dict,art_type_dict,dic_lables):
try:
del post_dict['is_repeat']
del post_dict['tags']
del post_dict['title_pd']
# 发送kafka
# producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
# kafka_result = producer.send("research_center_fourth",
# json.dumps(post_dict, ensure_ascii=False).encode('utf8'))
#
# print(kafka_result.get(timeout=10))
dic_result = {
'success': 'ture',
......@@ -122,21 +123,22 @@ def get_content():
except:
print('请求错误1')
continue
# for data_dict in data_list[::-1]:
for data_dict in data_list[:1]:
article_id = data_dict['article_id']
print(type(article_id))
# is_article_id = db_storage.find_one({'id': f"1534423014825668610{article_id}"})
# if is_article_id:
# continue
title = data_dict['title'] # 采集到的标题
pub_time = data_dict['input_date']
current_date = datetime.now()
yesterday = current_date - timedelta(days=1)
# 格式化日期
yesterday_date = yesterday.strftime("%Y-%m-%d")
# if pub_time <= yesterday_date:
# continue
title_dict_list = db_storage.find({'title_pd': title.replace(' ', ''), 'is_repeat': ''}) # 如果找到一样的标题 判断三天之内是否有重复的
is_repeat = ''
for title_dict in title_dict_list:
pub_time1 = title_dict['publishDate']
......@@ -152,6 +154,14 @@ def get_content():
doc_href = pq(href_text)
content_html1 = str(doc_href('.d2txt_con.clearfix'))
content_html2 = str(doc_href('.editor.clearfix'))
                # todo: 找到标题并拼接
title1 = doc_href('.d2txt.clearfix h2').text()
title2 = doc_href('.d2txt.clearfix h1').text()
title3 = doc_href('.d2txt.clearfix h3').text()
if title1 == '' and title3 == '':
title_final = title
else:
title_final = title1 + ' ' + title2 + ' ' + title3
except:
print('请求错误2')
continue
......@@ -170,7 +180,8 @@ def get_content():
origin = data_dict['origin_name']
a_dict = {
'id': "1534423014825668610" + article_id,
'title': title_final,
'title_pd': title,
'author': '',
'origin': origin,
'contentWithTag': content_html,
......@@ -183,6 +194,7 @@ def get_content():
}
art_content_dict[article_id] = a_dict
db_a_dict = a_dict.copy()
db_a_dict['title_pd'] = title.replace(' ', '')
db_storage.insert_one(db_a_dict)
if is_repeat == '':
print(href)
......
"""
"""
从es中拿到所有的标题
"""
import redis
from elasticsearch import Elasticsearch
from base import BaseCore
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
class EsMethod(object):
def __init__(self):
# 创建Elasticsearch对象,并提供账号信息
self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
self.index_name = 'subjectdatabase'
def queryatt(self,index_name,pnum):
body = {
"query": {
"match": {
"subjectId": "1534423014825668610"
}
},
"sort": [
{
"publishDate": {
"order": "desc"
}
}
],
"track_total_hits": True,
"size": 200,
"from": pnum
}
filter_path = ['hits.hits._id',
'hits.total.value',
'hits.hits._source.title',
'hits.hits._source.origin',
'hits.hits._source.publishDate',
] # 字段2
result = self.es.search(index=index_name
, doc_type='_doc'
, filter_path=filter_path
, body=body)
# log.info(result)
return result
if __name__ == '__main__':
es_method = EsMethod()
# 连接Redis
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
for i in range(56):
result = es_method.queryatt('subjectdatabase', i*200)
total = result['hits']['total']['value']
try:
msglist = result['hits']['hits']
except:
log.info(f'error-----{result}')
continue
log.info(f'---第{i}页{len(msglist)}条数据----共{total}条数据----')
for mms in msglist:
id = mms['_id']
title = mms['_source']['title']
origin = mms['_source']['origin']
pub_time = mms['_source']['publishDate']
try:
log.info(f'{id}--{title}--{origin}--')
item = id + "|" + pub_time
# r.lrem(f'XJPdatabase:id_2', 0, item)
r.lpush(f'XJPdatabase:id', item)
except:
continue
"""
"""
对标题进行操作
1.有空格的去掉空格
2.精确去重
3.杰卡德相似度去重
"""
#将数据读到csv中
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_excel('./test2.xlsx')
print(df)
# 去掉空格
df['title_1'] = df['title'].str.replace(' ', '')
print(df['title_1'])
#精确去重
# df_drop = df.drop_duplicates(subset=['title'], keep='first')
# duplicates = df[df.duplicated('title_1', keep=False)]['title_1']
#杰卡德相似度去重
# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),max_features=1000)
# tfidf_matrix = vectorizer.fit_transform(df['title'])
#
# dist = 1 - cosine_similarity(tfidf_matrix)
#
# df['similar'] = dist.mean(axis=1)
#
# df_drop = df.drop_duplicates(subset=['title'],keep='last')
# df_drop.to_csv('D:/data/titles_drop.csv',index=False)
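# Step 3 above ("杰卡德相似度去重") only appears as commented-out TF-IDF/cosine code; below is a
# minimal character-set Jaccard sketch (the 0.8 threshold is an illustrative assumption, not a
# value taken from this repo):
def jaccard_sim(a, b):
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

def drop_jaccard_duplicates(titles, threshold=0.8):
    kept = []
    for t in titles:
        # keep a title only if it is not too similar to any title kept so far
        if all(jaccard_sim(t, k) < threshold for k in kept):
            kept.append(t)
    return kept

# e.g. deduped_titles = drop_jaccard_duplicates(df['title_1'].tolist())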
\ No newline at end of file
# -*- coding: utf-8 -*-
......@@ -163,9 +163,8 @@ class BaiduTaskJob(object):
return kwList
def runSpider(self,kwmsg,com_name):
searchkw=com_name + kwmsg['kw']
wordsCode=kwmsg['wordsCode']
sid=kwmsg['sid']
baiduSpider=BaiduSpider(searchkw,wordsCode,sid)
......@@ -186,7 +185,15 @@ class BaiduTaskJob(object):
finally:
baiduSpider.driver.quit()
logger.info("关键词采集结束!"+searchkw)
import random
def get_comname(self):
# todo:读取redis里的企业名称添加到关键词上
com_name = baseCore.redicPullData('SousuoBaidu:companyname')
if com_name:
return com_name
else:
logger.info('====已无企业===')
return None
def PutWords(codeList, r):
......@@ -208,50 +215,17 @@ if __name__ == '__main__':
baseCore=BaseCore()
logger=baseCore.getLogger()
# ss='(中国机床工具工业协会|中国内燃机工业协会|中国机电工业价格协会|中国机械电子兵器船舶工业档案学会|中国仪器仪表行业协会|中国工程机械工业协会|中国文化办公设备制造行业协会|中国机械工业金属切削刀具技术协会|中国机械工业教育协会|中国汽车工业协会|中国机械通用零部件工业协会|中国环保机械行业协会|中国模具工业协会|中国机械工业勘察设计协会|中国机械制造工艺协会|中国机械工业审计学会|中国轴承工业协会|中国机电一体化技术应用协会|中国机械工程学会|中国液压气动密封件工业协会|中国铸造协会|中国通用机械工业协会|中国锻压协会|中国制冷空调工业协会|中国热处理行业协会|中国电工技术学会|中国仪器仪表学会|中国石油和石油化工设备工业协会|中国表面工程协会|中国食品和包装机械工业协会|中国焊接协会|中国汽车工程学会|中国塑料机械工业协会|中国机械工业企业管理协会|中国印刷及设备器材工业协会|中国机械工业质量管理协会|中国电器工业协会|中国机械工业安全卫生协会|中国重型机械工业协会|中国机械工业标准化技术协会|中国机械工业职工思想政治工作研究会|中国农业机械工业协会|中国机电装备维修与改造技术协会 |机械工业信息研究院|机械工业教育发展中心|机械工业经济管理研究院|机械工业信息中心|机械工业人才开发服务中心|机械工业北京电工技术经济研究所|机械工业技术发展基金会|机械工业哈尔滨焊接技术培训中心|机械工业仪器仪表综合技术经济研究所)+(私收会费|私吞|肆意牟利|损失浪费|索贿|贪财|贪官污吏|贪污|违背组织原则|违法|违纪|为官不廉|为政擅权|窝案|舞弊|泄露国家机密|信鬼神|性关系|虚假信息|虚假招标|隐瞒不报|隐瞒真相|营私|鬻爵|主动投案|资产流失|钻空子|钻漏洞|被调查|被双开|不担当|不老实|不良影响|不正当|不作为|超标准建设|超标准装修|吃空饷|吃拿卡要|渎职|对党不忠诚|非法批地|腐败|腐虫|腐化堕落|公车私用|公费开销|公款吃喝|公款出境|公款旅游|勾结|官迷心窍|好色|回扣|贿赂|挤占挪用|纪律审查|监察调查|监守自盗|践踏法律|接受审查调查|截留克扣|开除党籍|开除公职|抗议|利欲熏心|敛财|乱摊派|乱作为|落马|落网|买官|买卖审批权限|卖官|谋取暴利|谋取私利|目无法纪|幕后交易|弄虚作假|挪用公款|骗取|钱色交易|潜规则|侵害权益|侵吞公款|侵占挪用|圈子文化|权利扭曲|权钱交易|权色交易|山头主义|涉案|生活糜烂|生活奢靡|失察|失管|收送|受贿|双规|双开|私分|私人会所|私设小金库|负面|下降|违规|不利|亏损|上诉|不法|不良名单|停职|公开谴责|公诉|内幕交易|刑事拘留|刑事责任|刑拘|判决|判刑|判赔|司法处置|合同纠纷|处分|处罚|强制执行|仲裁|伪造|伪造公章|投案|投诉|拘留|接受调查|控诉|查封|涉嫌|涉诉监察调查|纠纷|经营异常名录|缉捕|罚单|罚款|罚金|罪犯|自首|获刑|行贿|警示函|贪腐|违约金|追究刑责|造假|逮捕|非法|非法集资判决书|申诉|纠纷|通报|开除|留党察看|追债|逃债|资产负债率|情色交易|搞权钱|曝光|黑料|重罚|虚假报告|侵犯)'
# keymsglist=baiduTaskJob.getkeywords(ss)
# print(keymsglist)
# 创建Redis连接
# r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
# codeList = [
# 'KW-20220809-0004',
# 'KW-20220524-0004',
# 'KW-20220809-0005',
# 'KW-20220824-0001',
# 'KW-20220809-0002',
# 'KW-20220809-0003',
# 'KW-20220826-0001',
# 'KW-20220602-0003',
# 'KW-20220602-0002',
# 'KW-20220113-0007',
# 'KW-20220113-0006',
# 'KW-20220108-0004',
# 'KW-20220113-0004'
# ]
# PutWords(codeList, r)
while True:
try:
# codeid = redicPullData("BaiduSearch:WordsCode", r)
# if codeid:
# pass
# else:
# PutWords(codeList, r)
# #codeList.append('KW-20220108-0004')
# logger.info(f'开始采集{codeid}')
com_name = baiduTaskJob.get_comname()
if com_name:
pass
else:
break
codeList = [
# 'KW-20220809-0004',
# 'KW-20220524-0004',
# 'KW-20220809-0005',
# 'KW-20220824-0001',
# 'KW-20220809-0002',
# 'KW-20220809-0003',
'KW-20220826-0001',
# 'KW-20220602-0003',
# 'KW-20220602-0002',
# 'KW-20220113-0007',
# 'KW-20220113-0006',
# 'KW-20220108-0004',
# 'KW-20220113-0004'
'KW-20240206-0001',
'KW-20240206-0002',
'KW-20240206-0003'
]
for codeid in codeList:
try:
......@@ -271,7 +245,7 @@ if __name__ == '__main__':
# 创建一个线程池,指定线程数量为4
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
# 提交任务给线程池,每个任务处理一个数据
results = [executor.submit(baiduTaskJob.runSpider, data) for data in kwList]
results = [executor.submit(baiduTaskJob.runSpider, data,com_name) for data in kwList]
# 获取任务的执行结果
for future in concurrent.futures.as_completed(results):
try:
......
# -*- coding: utf-8 -*-
......@@ -7,6 +7,7 @@ import logbook
import logbook.more
# 核心工具包
import pymysql
import redis
from tqdm import tqdm
# 注意 程序退出前 调用BaseCore.close() 关闭相关资源
class BaseCore:
......@@ -215,6 +216,8 @@ class BaseCore:
except :
pass
def __init__(self):
# 连接到Redis
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
self.__cnx_proxy = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project',
charset='utf8mb4')
self.__cursor_proxy= self.__cnx_proxy.cursor()
......@@ -288,65 +291,11 @@ class BaseCore:
def getRandomUserAgent(self):
return random.choice(self.__USER_AGENT_LIST)
# 获取代理
def get_proxy(self):
sql = "select proxy from clb_proxy"
self.__cursor_proxy.execute(sql)
proxy_lists = self.__cursor_proxy.fetchall()
self.__cnx_proxy.commit()
ip_list = []
for proxy_ in proxy_lists:
ip_list.append(str(proxy_).replace("('", '').replace("',)", ''))
proxy_list = []
for str_ip in ip_list:
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"http": proxyMeta,
"https": proxyMeta
}
proxy_list.append(proxy)
return proxy_list[random.randint(0, 3)]
def get_proxy(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxyMeta = "http://%(host)s:%(port)s" % {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
proxy = {
"HTTP": proxyMeta,
"HTTPS": proxyMeta
}
ip_list.append(proxy)
return ip_list
def get_proxyIPPort(self):
ip_list = []
with self.__cursor_proxy as cursor:
sql_str = '''select PROXY from clb_proxy where id={} '''.format(random.randint(1, 12))
print(sql_str)
cursor.execute(sql_str)
rows = cursor.fetchall()
for row in tqdm(rows):
str_ip = row[0]
str_ip_list = str_ip.split('-')
proxy = {
"host": str_ip_list[0],
"port": str_ip_list[1],
}
ip_list.append(proxy)
return ip_list
\ No newline at end of file
# 从Redis的List中获取并移除一个元素
def redicPullData(self, key):
try:
self.r.ping()
except:
self.r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=6)
item = self.r.lpop(key)
return item.decode() if item else None
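    # Usage sketch: redicPullData is the consumer side of the Redis list queues used across these
    # scripts (a producer rpush-es items such as a keyword or 'code|name'); e.g. the Baidu job above
    # pulls company names with baseCore.redicPullData('SousuoBaidu:companyname') and gets None once
    # the list is empty.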
\ No newline at end of file