Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
Z
zzsn_spider
概览
概览
详情
活动
周期分析
版本库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
统计图
问题
0
议题
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
丁双波
zzsn_spider
Commits
4e84d611
提交
4e84d611
authored
10月 26, 2023
作者:
薛凌堃
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
10/26
上级
c2749092
显示空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
367 行增加
和
46 行删除
+367
-46
RedisPPData.py
base/RedisPPData.py
+6
-3
smart_extractor.py
base/smart/smart_extractor.py
+15
-2
getTycId.py
comData/Tyc/getTycId.py
+4
-2
newsbucai.py
comData/Tyc/newsbucai.py
+38
-7
雪球网-年报.py
comData/annualReport/雪球网-年报.py
+3
-3
creditchina.py
comData/negative_news/creditchina.py
+131
-26
tycdt.py
comData/tyctest/tycdt.py
+2
-2
get_tokenCookies.py
comData/weixin_solo/get_tokenCookies.py
+1
-1
qiushi_leaderspeech.py
qiushi_leaderspeech.py
+167
-0
没有找到文件。
base/RedisPPData.py
浏览文件 @
4e84d611
...
...
@@ -66,12 +66,15 @@ def NewsEnterprise():
# gw_social_list = [item[0] for item in gw_result]
#todo:打印长度
# print(len(gw_social_list))
gn_social_list
=
[
item
[
0
]
for
item
in
gn_result
]
# gn_social_list = [item[0] for item in gn_result]
aaa
=
'91320500703676365K,91210000242666665H,9111010810114488XN,91110000633715962Q,91442000618120215D,9112000013270080XR,91330200144069541X,911100006004827014,91310115703490552J,91441300791200462B,914405007254810917,91320400608117856C,91310000607311067X,913101156822157531,91320200673924654N,9144010173974661X9,912102002423997128,91530000719480244Y,913300001469343082,911100007715928418,911100001013053805,91330784771942047G,91330000704206605E,91310000631658829P,91370000163044841F,91330481673858589X,91330100143779306C,91310000566558740F,91310000729394470F,91320583753203830Q,91320214MA1MLB3M2A,91441900745512430D,9131000070322836XD,91110108565775188E,91330382MA28657U43,91440500192754762W,913700007306705753,913700007456765902,913700006135617321,913707007884641257,91320200250456967N,913710007823210514,91371000680666729L,913710002642503020,91371000166734784G,9137100061375530XH,91370000735783157F,91371000264190434B,91370600669347065T,91440300192287527J,91440101759431420M,91420100731042634N,91440600193813525E,91445300707813507B,91340700719911235R,91330000256018570F,91330300609381595H,91330326766445257X,91310000741167473L,91650000722367867J,91610000719782242D,916101327428232411,91360500698460390M,91340100610307130N,913205006082757232,91430100760727392G,912103001190699375,91340100740872226E,914406007583005174,91420000706811358X,91440300723009295R,913100007653010244,91310000759874061E,91310000734047094H,91130100679916292B,913200001347865204,91330000668325921R,91540195754285145H,913206007691214935,914201006634595767,914201003001005805,9142010066953862X0,91420100771373833D,91420100724667038L,9142010073106629X0,91420100682300843F,914200006164176058,914201003002476548,91420111783183308C,91420100568359390C,91420100799777098J,914200007146087391,9142010073104498XQ,91420000177730287E,914201007071163060,91420100300053761B,914201005550145025,91420100731084500Y,913402007408704905,91340200762794150A,913402005785489960,913200006082793884,91420100300251645N,913205001381896946,9132020071491965XM,9132
0200724183068U,913202005502754040,91320200240505438W,91320200725201811T,91320200136349770P,91320200782736492H,91320211697939236T,91320282559266993J,91320200743938892R,91320200738287183E,913202007974023051,91320200060166715B,9132020072665605XK,91320200725202347B,91320200755862928J,91320205250220911F,91320200735716149R,91320200704046760T,91320200763551927E,913202000601816164,913202006816377193,913202110535042298,91320211790871547J,91320200775435667T,91320200732272706G,913202137186955428,91320200135890776N,91320200692568341T,91320211769137321E,91320205757981568F,91320200674440635K,913202006811293789,91320200741311914F,91320211250066467M,91440300770347406Y,914403007311028524,915101000833108553,916101315660088532,913502002600603688,9135020026013710XM,91350200260060034P,9135020061200896X6,913502002600846346,91350200612016388E,91350200751606855K,913502005750038518,91350200612033399C,913502007054071347,91350200612260049W,91350200155013367M,91350200260120674H,913502007054371227,91350200705466767W,91610103294241917P,91610100294239534G,9161010322061133XP,91610131628053714D,91610131628001738N,916101317974808482,91330500778299605T,91420000177583897Q,91420600714657151N,914105277474012089,913301006739591016,91440605053745575B,91330000143011639A,91440500231666168R,91350200784171077C,91650100228582422A,913300007195926252,91370600720751371J,91350505705245753B,91350000154341545Q,91330000609124409H,91330000142941287T,91110000710927126K,916300002265939457,916500007129676234,91650000722318862K,916502006978024838,916523007269460306,916529007846613320,91210100117812926M,91652800715548301W,91650000712966815D,91650100076066559G,916501004576329996,916501006827031595,91653100748663541B,916501007383763383,916542002035688270,91650100228584428B,91650200729156392B,91650000673412317X,91331081797615327C,91330300145498305G,914403007451834971,91411000780502633Q,91410724671699465T,914107006149375190,9141070068568407XM,91130400104365768G,9133038214550201X5,91420800764100001A,91330000148247018R,913203007
50041506E,911500007014638920,91370000184280878R,91341800779082563U,91411000772168241N,91330200734267003C,913203007974062428,91430900750606108C,913206007448277138,91610403338742407M,91350000158164371W,91420100616400352X,913210001410496687,913210036087333842,91321003725216976F,913210007908906337,9143010077900133XT,91370000166122374N,913700007337235643,91370600752684994W,91370600737203697G,91370000720717309H,913706006894842353,91370000613431903J,913706837381687230,9137000016503468XK,913706007063003983,91321000140719551F,91370300706025381Q,91441900618333632H,912201068239984307,91210000594843987K,915115002088509874,91330000725254155R,9143070067558223X2,913100007178563164,914405001959930485,91440000618095689N,91654002564379263N,91510100740327535Y,91320200704074497B,91640000227693286K,91330100749453087D,91441881755600266B,91210000716409709T,91421200562722881P,91510900754701583H,911525007116525588,91330000745826157T,91530000725287862K,91440606740846335Y,91350200612040492E,91350000727900106T,91330204746303411D,91330000722762533U,91350500583130113U,91430000760723375M,91310000748059571K,91110000600001760P,91350000611569108K,913310007530185122,91330000743452075L,911100007263731643,9132000072058020XK,91330200704800698F,914401017083874153,91340100610307675N,913204005603281353,91500112660889685L,912102002412697996,91330100720048715X,9161000006191747XU,913707007465823505,914306007225877126,914300007656254831,91330400551779794Q,915301007134092367,915301005551100783,91530000218920600L,91530000713464526C,91530000727317703K,915300007670584000,91530000709835283M,915300002194829991,915300007098268547,915300002179235351,91530000713401509F,915300007134134380,91410000170001401D,91610133132207011Q,9113040060115569X8,91330000723629902K,91340100327991758Q,91310000586778185R,91430800186881407B,91330183679858889H,91320582714943959W,913205007514219819,91320582790874377A,91320582729023768R,913205007185439367,913310211483889459,9133000074507862XQ,9133070314730958XR,9133000072527923XB,91330000759522
947D,91330600146342118G,9133000076251901XW,91331000554754592X,91330000609786138W,913310005753258189,91330481799606731M,91330000777214673R,91330503745844451F,91330000683124669E,91330723762512117A,91330000780477634Q,91330100757206158J,9133048172587440XX,913303007303249630,91330106563049270A,913300007245066803,91330000142944445H,91330000609671736A,913300007109591285,91430000796858896G,91330000742004828D,913300007308931541,91331000670275302R,91330100143956405Y,91331000255499827N,91330600609680368C,91330000704206103U,91330000704512063Y,9133000071257271X6,913310001481183122,91330683768696455K,913305007686509836,913306047498339794,913304216899850991,91330000609700859G,913300007315154541,9133000014871793XM,91330000609700795J,91331003148185689U,9133050066615054X7,91330109768216095R,91330481307761859N,91330000255072786B,91330000710969000C,913307837045812886,913301005865048038,91330000704277796X,91330800586274286A,91330400747729414J,913300001482332737,913300007664077600,91330000704676287N,913300007792873744,913301007517211839,913300001479306167,9133048114672516X7,91330000682900435M,9133070073033191X2,913300007368873961,913300001471207528,91330500763900410B,91330000609120272T,91330000741008835U,913300007463411432,91330100704290413D,913300001464759067,913306005547614262,913300007530435745,91330600753964306M,913300007829495191,913307007707246030,91330000723629566F,913306006096100281,913300007964528296,9133000025403311XB,91330000704503984N,91330000146684900A,913300006628918505,91330600146150140Y,91330000704676703E,91330000147115443M,91330000704204554C,91330000729092173R,913308025765293106,9133000074981708XL,9133000074100296XK,913300007047850454,91330600796466462H,913300001469757672,91330200704803223P,91330400X0942984X0,91330000704202137E,91330000142943469Q,9133060072360502XQ,91330500550515703T,91330211254100749G,91331023724526593X,91330000724502487G,91330000712560575G,91330000142927960N,913305007804719612,9133072670455890XG,91330681307478340A,9133000074200262XL,91330703781824255T,
913310826683250245,91330000667124503L,915107007939595288,91330000726586776L,91330200698208670Y,91330000710987081Q,913304007352793803,91330000146183233T,91330500056855710M,913306837045254701,913300006096907427,913307007288998483,91330600145965997H,9133100070471153X3,91330000749843368W,913300001476445210,91330700147493495C,91330000779370442Q,91330102785327408E,91330281730145332E,913300007345233459,91330100665212665U,91330000704720655L,9133000077191496X7,91330400677231599U,913306006617396382,91330000607911599B,91330000720082446H,913310816100020466,91330000704207173A,9133110035546965XU,91330000717612987P,91330500704459485N,91330000704715960K,9133100075709503XC,91331000148144211K,91330523746336790G,91330100697072273U,91330300681666245N,913306247265987348,91330700566953812T,91330600779389434M,91330000755902563E,913304810852709304,913300007324065674,91330621755903566B,91330000754921594N,913300007043088475,913300002552164796,9133000074771866XW,91330326661705454E,91330000734522019A,91330000722765769J,91331000704716189P,91330000733811206X,913300007042034718,91330600704507918P,913310001480293875,91330621143010433H,913300001429192743,913306006628977937,913300001463546966,91330000MA2GAEH14D,91330000704721033H,91330683704477704R,91330301254496691M,91330600745085889D,91330000146150706G,9133000014616098X3,91330000146008822C,91330700776454800N,913303007793989040,913306216784286764,91330000704713738F,913308007639292214,91330700753962378R,91330000719525000X,913300006702752064,91330621721077606P,913311245623690963,913300007450544091,91330521147114918E,91330500753970802B,913300001429120051,91330000143906141R,91331082255225797Q,913300007200827022,9133000070459138X9,91330783552855277N,91330302585012778N,91331000782935301K,91330100751742531B,91330300145574611C,91330400559682372X,91330100735254191Q,91331000779358865H,91330600609661634M,91630000226882472D,91110000267130226N,91410100739082104K,91410100594879787D,91410100769490897P,91410100760248041Q,91321100718698874L,91321100761019494Q,91430
000712108626U,91370000164960593R,91370300787160568U,913500007356527552,91420100177682019R,91441700197332374T,913700007357889006,913207007849886428,91350000158156419U,913500007279127050,91341300711774766H,913706007763110099,91320205704071771A,91440300754257216E,91411300742548454G,91340100743098352K,913100007514799050,91350000158150236X,91370000206951100B,914420007251062242,91500105759295238A,91520000214466447K,91220000123925847D,913301101438971341,91210000243490294P,911101086835621402,914405007224920787,91310000558762442G,91120113727536666U,91420000707079234K,91440300192420826L,91440400775088415F,914404007211055669,91440400799386302M,9144040077096114X2,914404001925952982,9144040019252393X0,91440400782988681C,9144040075452568XT,91440400665003767C,91440400192520640G,914404006183915766,9144040072510822XR,914300007808508659,914302007656224696,91430200616610317F,91430200799104619D,91430200776779744R,91430200712106524U,913703007347051654,91130300104366111M,914419006698651618,914300007121944054,913305007302929303,91442000618132806P,91330100X09172319F,91440101712408557U,913205006082844193,914401011904604026,91610000MA6TL0ET50,91440000725063471N,91330000747735638J,91510000207312079C,9110000010002371XD,911000000000192465,91100000100017045K,91100000100023728D,91100000102063897J,911100000000184548,91110000100000489L,9111000010000093XR,91110000100000948R,91110000100001035K,91110000100001043E,91110000100005220B,911100001000055386,91110000100005888C,9111000010001002XD,91110000100010433L,91110000100010660R,911100001000128855,91110000100014493P,91110000100017707H,91110000100018267J,911100001000238160,91110000100024915R,911100001011200015,9111000010169286X1,91110000102016548J,911100001055722912,9111000071092446XA,91110000710924910P,91110000710924929L,911100007109250324,91110000710925243K,911100007109279199,911100007109284282,91110000710929498G,91110000710929930X,911100007109303176,911100007109310534,91110000710931061Y,9111000071093107XN,911100007109311097,9111000071093123XX,9111000071
0932515R,91110000710935732K,911100007178306183,91110000717830650E,91120000103063434T,911200005534349336,91130000677356885K,9113010010459478X9,91130500784050822M,91133100MA0GBL5F38,91140000070450154H,91140000MA0L7G6D21,91140000MA0LAJ3H0K,91210000558190456G,912201011239989159,91310000128515986K,91310000132200821H,91310000132206289R,9131000032469570XM,91310000710924478P,91310000749599465B,913100007956724321,91310000MA1FL1MMXL,91310000MA1FL70B67,91310000MA1H32292H,91310115084107728R,91320000608946953Q,913205097344220935,91320509796141166A,913300001430490399,91330000734530895W,9133000079338631XM,91330109143586141L,91330300751176226P,913306811462584935,91340700151105774A,91350200154990617T,91350200260147498N,91360000158264065X,913700002671781071,913700006722499338,913716261672060009,91410000663414132K,914200001000115161,914201001776819133,91440101231235448R,91440101231247350J,91440101347466547J,9144010172502048XD,91440300746645251H,914403007675664218,91450000198229061H,91460000708866504F,915101006863154368,91510100MA68KHX35G,91510100MAACK35Q85,915300002919962735,91610000220568570K,916100007625687785,91620300224690952T,916501005991597627,91110000717843275N,913400001489746613,911200007109339563,914200007581510645,91350500717357365T,91330000143995391Q,914401017083429628,912301001275921118,91110000710931141J,91150000114392559E,9111010155140268X8,91150100114111403U,91150100743882956A,91150000797181303E,91150000MA0NLRJ076,9115010011412575X7,91150000793609990G,911500000957889956,91150100701462670A,91150000783023945D,9115010079018195X0,911500001141229700,91150000X270330054,91310107132936177G,911500007013056834,91150000MA0N4HYF5X,91150000067504705P,91150000MA0PQMHU5T,91150000MA0QG7U246,913101071337312962,91310109630360895R,913100006318655619,91310000132295701K,91310105132207185C,91310110133226572E,91310109051268760Y,913101101332266447,91310101132307282H,91310115067758342E,91310115749279533D,91310101132220662G,91310000769684149F,913101061322082417,91310000132291639W,914602007212786
51T,91310118134373072F,913101098322500968,9131011313229600XR,91310109133065617J,91310113133405407T,91310000133139647N,91310101132504980J,913101171341092719,91310109631783195P,9131000073458050XB,91310120630941673A,91310104607218524U,91310000631319149J,91310000631757739E,91310000667805050M,913100005529432935,91310000132284295X,9131000013221297X9,913100003123156507,91310000MA1FL4Y718,913100007547623351,913100005601172662,91310000132201410X,91310000630245184Q,91310000132263849G,913100001322382488,91310101132214828F,913100006318635745,91310000132220312N,913100001322718147,9131000013221713XU,913100001322319278,91310000759006889A,913100001322128733,91310000132262168G,913100001322221746,91310000132228728T,913100006317558649,91310000MA1FL7ABX7,91310000MA7MGAGH5G,91310000MAC7AGQK2E,91310000MABXQWH71W,91310000132276535F,91310000MA1FL7QP6U,91310118MA1JPAB85C,91320000134795187R,913200001347595731,9132000078271658X0,91320000134767063W,91320000MA1P1ERM7T,91320000735724800G,9132000013478500X8,9132000072058717XP,91320000MA1YLUXD2W,91320000720587823R,913200001347771223,91320000134787937Q,913200001347507715,913200003238683144,91320000720585377G,9132000013475748XE,9132000077203354XH,91320000134850027D,913200003235715453,91320000MA2040GUXY,91320000MA20TXYX52,91330000142913112M,91330000758050706G,91330000671637379A,91330000798592788H,91330900307662068B,913300000683517554,913300007236299969,913300007276037692,91330000142917666G,91330000142911723D,913302005670431750,1233020075625385XG,9133020078676743XA,9133020014407480X5,91330200144055000M,913302007960219655,9133020025410298XE,9133020014409064XG,913302007900686311,91330200784303172P,91330000782926659M,91330000785683832W,91340000148943240Q,913406006775926850,91340300149861466W,913400000803136982,9134000072331410XA,913400007049015954,91340000MA2RYG6M22,91340000711778783B,9134000079644292X6,91340000148971532P,91340000705045276Y,91340000705044214B,913400001489739848,91340000148940701E,91340000MA2NKAX24Y,91340181153580560D,913406001508200390,9
1340400MA2RP38K42,913401001491402635,91340000670904113Y,9134000072632213X9,916100006847589897,91610700222542067H,91610000797924728K,91610301710086048M,91610000220527103Q,91610000MA6TG06P4D,91610000305718646D,91610000059672418N,9161000030577092X8,91610000052117366D,91610302713512620P,91610000220526151G,91610000MA6TG5H46J,91610000727343864B,9161000079078454X9,916100003054628635,91610000220535146F,91610000570663973E,91610000MA6TG43NXM,916100002205334589,91610000681587782E,91610000059670527U,91610000074509969A,91610000570668168K,916100005835106342,91310106780596246P,91610000220575738M,91610000220520630C,91610000056901668Y,91610113MAB0QTF62E,91610800MA70FUB2X9,91620300739622350L,916204005716275797,91620000720299995F,91620700670824180R,91620000MA72YXQ613,91620000MA748HK51R,9162000022433064XN,91620000438000013K,91620102719097851D,91620000712756631E,916200006654372581,91620000073568983Y,91620000MA73UCJ850,916200009245943712,91620000X24100305D,916200002248721900,916211227102756155,91620000745866570W,916200006654252818,916200007190339464,91620000719056611M,916200002244326626,916200002243386254,91620000719077033J,12620300438260237M,916202002246412029,91620105224526511D,9162010567083758X1,91620100224469959T,91620103224485561A,91620100296581077N,91620000MA72TBW70N,91620000224552058P,91620000MA74MFFBX6,91620000296584040E,91620000098238577P,91620000MA73J82M0G,91620000296623476P,91620000MA73WG5A4R,916328262275740092,91633300MA759FFA1W,91630000015000548J,916300007104085373,91630000226580757M,916300002265829634,916300007105860692,91633100564915871B,91630000710483537W,916300007105470788,91630000710404288D,91630000226586921N,91630000226591034H,9144030072619270XF,9163000071040638XJ,91640000694320542R,91640000MA75XH043G,91640000MA75XG8B9G,91640000710606357L,91640000MA75XJ7M0P,91640000227692945W,91640000MA75XJ33X8,91640000MACDDMB52H,91110108778635402E,9165042122906597XU,91650102228670318F,916501037817878108,91650000722328999R,91650100299945024F,91650100228603027Y,91650103228581884H,916501
022286956386,91650000710883848L,91650100697822433D,91650100228713891B,916500007223141241,91650100MA77GLNN5M,91650000228593105A,91650100228580718E,91650102228670238U,91650103228695179F,91650000MA776A778F,9165010222858513XG,916500002286736663,91650000091941411H,9165010059916844X8,91650100599166567D,91650100666655871D,91650100076066313E,9165010022872948XD,916500007318392722,91650100228595231A,91650000MA79HAN55K,91650000MA7JYR3K9T,916500002285806033,91650109795790391X,91410000706780942L,914100006987322024,914100007982385511,91410000747444427A,91410300171076114N,91410000415800253F,91410000712649924H,91410000693505019R,914100005817422124,91410000698736553A,91410000MA9LE1618G,91410000699963723F,91410000MA9LNHTF1C,9114000075725677X0,91140000701002121R,911400001100144545,91140000MA0HL5WN2L,911400007460236201,91140000110053488C,91140100MA0H5Y7R8Q,91140000MA0HLAAE2A,911400006686150485,91140000112360000T,91140000678191736U,91140000110112812M,91140000110014497J,91140000110014112R,91140000694272341C,91149900MA0KYJFJ38,91149900MA0L1MLR6W,91140700MA0LB2T300,91152900MA0NBNHB9X,911529000539164509,91150802701437045J,91150800MA0MWLMGXC,911508027901707321,91150800MA0MWAC131,91150802720185680P,911508027012747412,91150204747901974B,9115020072010852XH,911502005581423315,91150200MA0N121B5U,91150200318506394X,91150204736100047A,911502003289900529,911502027830054053,91150400676905272F,91150400MA0MY24T05,911504021148077345,91150400MA0N0N7J0Q,91150400MA0N1KDY95,91150600MA0MYRY61P,911501007013703625,91150100MA0MYCFT85,91150100701339816T,91150100MA0Q6LBR1F,91150100783008323W,9115010008518731XJ,91150121MA0Q1P3N1B,91150100341438852Q,91150700MA0N3U2C7N,911507005788814577,9115070075255329X1,911507007761138545,911507001151975447,911505007332895859,911503003414432561,911503007012609990,91150302701260876U,91150302114670221P,91150900MA0PXXPT6X,91150900MA0PXPJD5M,91152500397354738G,91152500790165976M,91152200341286924J,91310118677856310T,914100007251292747,91330100662324231U,9111000066990444XF,91330000148
868586D,91330000723628803R,91220000729540909F,91350200705487306K,91220000702425994U,91330901148716005T,91440300192255939T,91540000710905111C,91510000202285163Q,91451100711427393C,91140000267171001C,91420000722084584J,91340100754889192Q,91510100224367821D,9135010078216907X1,91610131556950212T,91320000100026961J,913300001460375783,914403007451740990,9133000071095874X3,914200007220290598,91440300665899831W,911400007460463205,91320211100013394P,91340000148975314D,914101007324826746,91220201124496079Q,9134020073498415XP,91441200725995439Y,914300007533850216,91310115729533231F,91370000163446410B,914103006148088992,91530000216521606P,9133000070471161XA,913710007060840744,91110000802062406U,91321200608812146K,91330200610257495J,91321000703903783L,91310000695826254X,91320506567813635P,91220101606092819L,91310000751873021H,91360106784146840K,91520000214433792D,91340100664238732X,91340500733034312N,915101845722876769,91510000725526042X,91320400747314251P,91370600746569906J,914303007483865809,91440000197576715Y,915100007422540773,91110000192472028J,91340121674200463H,91440101190484084A,91220101605902656F,91420112768092336G,911100007226014149,91330500720068476A,915203002147892034,914403001923528003,913501007438096369,914206001793145000,91130100236018805C,913200007455797746,911100006835529627,91440500723817938W,91230800127590757N,91340000704920454F,9137020071802356XK,91441900617994922G,911100007178710060,91510600205366604X,915200007366464537,913202007265601380,91330100762017394J,91330100742001328G,91330000758062811X,911101086723891430,913700007207576938,91430200712137961U,91110000710929148A,91310000134616599A,91320213061850324J,91450300708618439A,91330600739910598X,91110000795997288B,91110000192184333K,91441900618367138U,91330200610271537C,91310000631899761Q,914303007170467196,91445200077874291G,914400007250669553,91320600138299578A,91330000254847375U,91350200155052227K,91320200772038068L,91360500716575125F,91440300618888515F,91330000704277673W,91370000706385950B,9161010329424149
0X,913101156745626329,91310000133727203Q,91440400725466481C,911100002717519818,91110000100028633H,915001077500638601,915200006707225551,91440300618884987N,913202006079522354,913401006709173443,913702002646064362,913600007055083069,91340100348841353K,911200001030705897,91320000734417390D,913203001347934993,91320000249707722B,91310000132210544K,9135020076928783XA,9151060020515584XN,9132110075321015XF,913603007165007488,9132020072653508XD,91440300783905518J,913201921349556628,914102002681294387,91320300660802674Q,913302007342813661,91450800198227509B,9153000071947854XF,9111000010196866XH,9165000056438859XD,913300007743880298,9111000070024070XK,91320000743141824Y,913602007841010956,91500000202802570Y,91120000700492827M,91341100704965812G,913213001423289417,9132060072521804X6,91430500763263554A,91330000142943303A,91230200710935767F,91130300105390439K,911100008020705889,91310000607200164Q,913502007516215965,91220000123962584G,91310000745611834X,91420200714697006M,91110000100017336T,91370000168130028J,91310000133501183B,916100006611776206,913400001490341376,91442000617979677N,91410900744099904P,91430600186201870U,91610131710183542G,91210106243406830Q,91620000224371505Q,91110000710934537G,91350000158166297A,9151010075598305X1,913716007986665561,91320900743731816A,91150000720180740Y,913500001563372595,914503001991037270,9142010017767908XR,91330000704690900L,91320000134775688R,91441900673133772K,91410000706784652H,91320200578117344H,91610000745016111K,91110000710935257Q,915000002028133840,91610000220594875E,916200007202575254,91321200141076367Y,91340207664238230M,91110000700148065Y,914403007084294519,91360500674954556L,91370000267171810H,91510100201958223R,911100007109323200,914403001924223896,914403007917461234,91310000132212291W,91120000734546571Y,91370000706206553L,91410000733861107G,9131000013222900X9,9131000013221035XN,915100007758164357,913302007503813672,91610113628001682H,91440101100006899U,91341700MA2N8L8704,91130100700714215X,913100001322084958,91440800617803532R,91
620000712759170Q,ZZSN23011300000009,91320000751254554N,91420000271756344P,91330000146884443G,91310000425011944Y,91130000673224391T,916103007099018935,91530000709829203J,91620000224336881T,91320500720523600H,914406007192139717,9134000070503581XA,91310000132203723P,91510000206956235C,91330000720085639T,91430000183898967C,9132000074557990XP,913700007254238017,91440101190517616D,913708007657630504,91410100170033534A,911410006838069266,91430000185034687R,91230199723661865E,91330000143839073P,91321100608834062C,91320000703971102J,911100007226144851,91330000710959275N,91330000142917121G,914401017397031187,914100001700014285,91340100610300772G,91330000712550473W,91370200163621493Q,9142000070689187XB,914500001982250954,914502001982303373,913200007194121453,91500000202819532B,9131000013220921X6,913100006316131618,914401830545413557,91620000224529093P,91371300672231450Y,91440500192983581M,91370000163099420E,91140000110055862W,91110000717825966F,914403002793464898,91120103103368983M,91310000710933112E,913100005791928139,91210200604862592R,91610000727342693Q,914404001925268319,91610000713550723T,91440300192238549B,91440300192241158P,913100006311887755,91350200158163213A,91310116662478847M,91450000715182397J,91440400628053925E,91110000625909986U,91310112792759719A,912101002434901556,916500002296811666,91340000719986552R,914300001837784984,911400001123599660,91350000158160688P,91650000132278661Y,91520000714303759X,91430000183774980R,913300007046976861,91310000132202296L,913700001630684138,91320000670145129U,91510000206152800A,91120000712830811X,91510100725369155J,916500002286626765,91320000714091899R,91440700193957385W,912200007911418611,91330100719572130W,911100007109338846,91330100143200149A,914403002793630194,91230100127046743W,911311007216760190,91650000712958321C,91340000730032602U,913700002671842400,91430300755843372T,91370000729270531X,9144020019153918XA,91310000132210595U,915000007093295592,911300007468680177,91130400106900891W,915111002069551289,91360600759986995D,9137000
07409658444,915400002196726375,91320500746203699Q,91350100739548277W,91370000588768482Y,914405002311310326,91513200211352460H,913417006836379072,91420000179120511T,91211200201909093K,91430300722573708K,91450200715187622B,91440000710924128L,91360000705515290C,91440101761932988M,91110000710935329H,913100001322300861,91140000725909617E,91430700183811016L,9133000014553840X5,9131000070327821X9,911100001000127624,91610131726285914J,91440400721169041N,91330100742929345R,91100000710932021X,9113020070071264XQ,91310000132207732J,91370000163098284E,913209001401417456,911100007109351850,91330000732023371N,916500009287328820,91450000791346584E,91330000720084441G,91440101633208952W,91370000164960403T,91410000170011642P,911100007334480727,91320000741339087U,9142010072466171X0,9144050061755920X4,91310000132234925P,914403001922545226,913300001461463526,91340000731686376P,914100007126456409,91650000710892189L,915300002919937260,913207007382577341,914403006188988448,91140000701012581E,9137068166139756X1,91610300220533749U,91320582MA1NU2QE9N,91320100738866409D,913403007199576633,91320200720584462Q,91330701254999838P,914203005654858771,91320100631402444M,913100006072212052,91510600205363163Y,91130200721620963C,91110000740091307R,913300007441437848,91330200144565596J,916400007749178406,913404007109235209,'
gn_result
=
aaa
.
split
(
','
)
gn_social_list
=
[
item
for
item
in
gn_result
]
print
(
'======='
)
#将数据插入到redis中
for
item
in
gn_social_list
:
r
.
rpush
(
'NewsEnterprise:gnqy_socialCode'
,
item
)
#
r.rpush('NewsEnterprise:gnqybc_socialCode', item)
#
r.rpush('NewsEnterprise:gnqy_socialCode', item)
r
.
rpush
(
'NewsEnterprise:gnqybc_socialCode'
,
item
)
# for item in gw_social_list:
# r.rpush('NewsEnterprise:gwqy_socialCode', item)
...
...
base/smart/smart_extractor.py
浏览文件 @
4e84d611
# -*- coding: utf-8 -*-
import
sys
import
pandas
as
pd
import
requests
from
goose3
import
Goose
from
goose3.text
import
StopWordsChinese
,
StopWordsKorean
,
StopWordsArabic
from
base.smart.entity
import
*
from
base.smart.smart_extractor_utility
import
SmartExtractorUtility
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base
\\
smart'
)
from
entity
import
*
from
smart_extractor_utility
import
SmartExtractorUtility
# goose3自带的lxml,提示找不到etree,但仍可使用
from
lxml
import
etree
from
lxml.html
import
HtmlElement
...
...
@@ -135,6 +138,16 @@ class SmartExtractor:
return
self
.
get_extraction_result
(
article
,
link_text
)
def extract_by_html(self, html, link_text=''):
    """
    Extract article content from an HTML string.

    ``html`` is handed to ``self.goose.extract`` as ``raw_html``; the
    article it produces is then passed, together with ``link_text``, to
    ``self.get_extraction_result``, whose return value is returned as-is.
    """
    # Extract the body text: the markup itself is supplied, not a URL.
    parsed_article = self.goose.extract(raw_html=html)
    extraction = self.get_extraction_result(parsed_article, link_text)
    return extraction
#url_list = [["搜狐新闻",'https://news.tianyancha.com/ll_uc76l7d774.html?gid=1499023','430418'],.....]
def
extract_by_url_test
(
url_list
,
list_info_all
):
# 测试:按URL采集
...
...
comData/Tyc/getTycId.py
浏览文件 @
4e84d611
# 根据信用代码获取天眼查id
import
json
import
random
import
sys
import
time
import
pymysql
import
requests
from
base.BaseCore
import
BaseCore
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base'
)
import
BaseCore
import
urllib3
urllib3
.
disable_warnings
(
urllib3
.
exceptions
.
InsecureRequestWarning
)
requests
.
adapters
.
DEFAULT_RETRIES
=
5
baseCore
=
BaseCore
()
baseCore
=
BaseCore
.
BaseCore
()
log
=
baseCore
.
getLogger
()
headers
=
{
'Accept'
:
'application/json, text/plain, */*'
,
...
...
comData/Tyc/newsbucai.py
浏览文件 @
4e84d611
...
...
@@ -6,11 +6,12 @@ import requests, time, pymysql
import
jieba
import
sys
from
bs4
import
BeautifulSoup
from
kafka
import
KafkaProducer
from
getTycId
import
getTycIdByXYDM
# from base.BaseCore import BaseCore
# from base.smart import smart_extractor
sys
.
path
.
append
(
'D:
\\
zzsn_spider
\\
base'
)
sys
.
path
.
append
(
'D:
\\
kkwork
\\
zzsn_spider
\\
base'
)
import
BaseCore
from
smart
import
smart_extractor
import
urllib3
...
...
@@ -51,6 +52,22 @@ cursor_ = baseCore.cursor
taskType
=
'企业动态/天眼查/补采20W+'
def reqDetailmsg(url, headers, retries=1, timeout=8):
    """Download the detail page at ``url`` and return its decoded text.

    Args:
        url: Address of the detail page to request.
        headers: HTTP headers sent with the GET request.
        retries: Maximum number of attempts. Defaults to 1, which is the
            single attempt the previously hard-coded loop performed.
        timeout: Timeout in seconds passed to ``requests.get``. Defaults
            to 8, the previously hard-coded value.

    Returns:
        The response body as text, or ``''`` when no attempt produced a
        non-empty body (request error, empty response, or ``retries`` < 1).
    """
    # Defined up front so the return value exists even if the loop never runs.
    htmltext = ''
    for _ in range(retries):
        try:
            # verify=False: TLS certificate verification is skipped.
            response = requests.get(url=url, headers=headers, timeout=timeout, verify=False)
            # Decode with the encoding detected from the body content.
            response.encoding = response.apparent_encoding
            htmltext = response.text
        except Exception as e:
            # Best effort: any failure is logged and treated as an empty page
            # so the caller can decide what to do with a missing body.
            htmltext = ''
            log.info(f'{url}---详情请求失败--{e}')
        if htmltext:
            log.info(f'{url}---详情请求成功')
            break
    return htmltext
def
beinWork
(
tyc_code
,
social_code
,
start_time
):
time
.
sleep
(
3
)
...
...
@@ -171,13 +188,27 @@ def beinWork(tyc_code, social_code,start_time):
# 开始进行智能解析
# lang = baseCore.detect_language(title)
# smart = smart_extractor.SmartExtractor(lang)
#带标签正文
contentText
=
smart
.
extract_by_url
(
link
)
.
text
#不带标签正文
content
=
smart
.
extract_by_url
(
link
)
.
cleaned_text
# time.sleep(3)
# req = requests.get(url=link,headers=headers,timeout=10)
# html = BeautifulSoup(req.content,'html.parser')
raw_html
=
reqDetailmsg
(
link
,
headers
)
if
raw_html
:
# soup = BeautifulSoup(raw_html, 'html.parser')
try
:
article
=
smart
.
extract_by_html
(
raw_html
)
content
=
article
.
cleaned_text
contentText
=
article
.
text
except
Exception
as
e
:
log
.
info
(
f
'抽取失败!!{e}'
)
# #带标签正文
# contentText = smart.extract_by_url(link).text
# #不带标签正文
# content = smart.extract_by_url(link).cleaned_text
# # time.sleep(3)
except
Exception
as
e
:
contentText
=
''
if
contentText
==
''
:
log
.
error
(
f
'获取正文失败:--------{tyc_code}--------{num}--------{link}'
)
e
=
'获取正文失败'
...
...
@@ -281,7 +312,7 @@ def doJob():
while
True
:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
social_code
=
baseCore
.
redicPullData
(
'NewsEnterprise:gnqybc_socialCode'
)
#
social_code = '91440300665899831W
'
#
social_code = '913205007764477744
'
# 判断 如果Redis中已经没有数据,则等待
if
social_code
==
None
:
time
.
sleep
(
20
)
...
...
comData/annualReport/雪球网-年报.py
浏览文件 @
4e84d611
# -*-
coding: utf-8 -*-
# -*-
coding: utf-8 -*-
...
...
@@ -213,7 +213,7 @@ def spider_annual_report(dict_info,num):
'sid'
:
'1684032033495392257'
,
'sourceAddress'
:
year_url
,
# 原文链接
'summary'
:
''
,
'title'
:
name_pdf
.
replace
(
'
,
pdf'
,
''
),
'title'
:
name_pdf
.
replace
(
'
.
pdf'
,
''
),
'type'
:
1
,
'socialCreditCode'
:
social_code
,
'year'
:
year
...
...
@@ -260,7 +260,7 @@ if __name__ == '__main__':
start_time
=
time
.
time
()
# 获取企业信息
# social_code = baseCore.redicPullData('AnnualEnterprise:gnshqy_socialCode')
social_code
=
'913
412007050444417
'
social_code
=
'913
30000734507783B
'
if
not
social_code
:
time
.
sleep
(
20
)
continue
...
...
comData/negative_news/creditchina.py
浏览文件 @
4e84d611
...
...
@@ -33,13 +33,14 @@ def getRequest(url,headers):
return
json_data
# 严重失信
def
dishonesty
():
def
dishonesty
(
headers
,
com_name
,
social_code
):
list_dishonesty
=
[]
param
=
{
'tableName'
:
'credit_zgf_fr_sxbzxr'
,
'searchState'
:
'1'
,
'scenes'
:
'defaultscenario'
,
'keyword'
:
'雷州市白金银座演艺文化实业有限公司'
,
'tyshxydm'
:
'91440882315032592M'
,
'keyword'
:
com_name
,
'tyshxydm'
:
social_code
,
'page'
:
'1'
,
'pageSize'
:
'10'
}
...
...
@@ -50,14 +51,14 @@ def dishonesty():
if
json_data
[
'status'
]
==
1
:
pass
total_size
=
json_data
[
'data'
][
'totalSize'
]
for
page
in
total_size
:
for
page
in
range
(
1
,
total_size
+
1
)
:
param_page
=
{
'tableName'
:
'credit_zgf_fr_sxbzxr'
,
'searchState'
:
'1'
,
'scenes'
:
'defaultscenario'
,
'keyword'
:
'雷州市白金银座演艺文化实业有限公司'
,
'tyshxydm'
:
'91440882315032592M'
,
'page'
:
f
'{page}'
,
'keyword'
:
com_name
,
'tyshxydm'
:
social_code
,
'page'
:
page
,
'pageSize'
:
'10'
}
url_page
=
f
'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_zgf_fr_sxbzxr&searchState=1&scenes=defaultscenario&keyword={param["keyword"]}&tyshxydm={param["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...
...
@@ -67,7 +68,7 @@ def dishonesty():
pass
info_list
=
json_data
[
'data'
][
'list'
]
for
info
in
info_list
:
entity
=
info
[
'entity'
]
entity
=
info
iname
=
entity
[
'iname'
]
# 失信被执行人姓名/名称
cardnumber
=
entity
[
'cardnumber'
]
# 组织机构代码
court_name
=
entity
[
'court_name'
]
# 执行法院
...
...
@@ -83,15 +84,34 @@ def dishonesty():
performed_part
=
entity
[
'performed_part'
]
# 已履行部分
unperform_part
=
entity
[
'unperform_part'
]
# 未履行部分
dataSource
=
info
[
'dataSource'
]
# 数据来源
dic_dishonesty
=
{
'失信被执行人姓名/名称'
:
iname
,
'组织机构代码'
:
cardnumber
,
'执行法院'
:
court_name
,
'省份'
:
area_name
,
'执行依据文号'
:
case_code
,
'立案时间'
:
reg_date
,
'案号'
:
gist_cid
,
'做出执行依据单位'
:
gist_unit
,
'生效法律文书确定的义务'
:
duty
,
'被执行人的履行情况'
:
performance
,
'失信被执行人行为具体情形'
:
disreput_type_name
,
'发布时间'
:
publish_date
,
'已履行部分'
:
performed_part
,
'未履行部分'
:
unperform_part
,
'数据来源'
:
dataSource
}
list_dishonesty
.
append
(
dic_dishonesty
)
return
list_dishonesty
# 行政处罚
def
punish
():
def
punish
(
headers
,
com_name
,
social_code
):
list_punish
=
[]
param
=
{
'tableName'
:
'credit_xyzx_fr_xzcf_new'
,
'searchState'
:
'1'
,
'scenes'
:
'defaultscenario'
,
'keyword'
:
'雷州市白金银座演艺文化实业有限公司'
,
'tyshxydm'
:
'91440882315032592M'
,
'keyword'
:
com_name
,
'tyshxydm'
:
social_code
,
'page'
:
'1'
,
'pageSize'
:
'10'
}
...
...
@@ -106,15 +126,16 @@ def punish():
if
total_size
>
0
:
pass
else
:
log
.
info
()
for
page
in
total_size
:
log
.
info
(
f
'该企业{com_name}无行政处罚信息'
)
return
list_punish
for
page
in
range
(
1
,
total_size
+
1
):
param_page
=
{
'tableName'
:
'credit_xyzx_fr_xzcf_new'
,
'searchState'
:
'1'
,
'scenes'
:
'defaultscenario'
,
'keyword'
:
'雷州市白金银座演艺文化实业有限公司'
,
'tyshxydm'
:
'91440882315032592M'
,
'page'
:
f
'{page}'
,
'keyword'
:
com_name
,
'tyshxydm'
:
social_code
,
'page'
:
page
,
'pageSize'
:
'10'
}
url_page
=
f
'https://public.creditchina.gov.cn/private-api/catalogSearch?tableName=credit_xyzx_fr_xzcf_new&searchState=1&scenes=defaultscenario&keyword={param_page["keyword"]}&tyshxydm={param_page["tyshxydm"]}&page={param_page["page"]}&pageSize=10'
...
...
@@ -141,6 +162,88 @@ def punish():
cf_sjly
=
entity
[
'cf_sjly'
]
# 数据来源
cf_sjlydm
=
entity
[
'cf_sjlydm'
]
# 数据来源单位统一社会信用代码
dic_punish
=
{
'行政处罚决定书文号'
:
cf_wsh
,
'处罚类别'
:
cf_cflb
,
'处罚决定日期'
:
cf_jdrq
,
'处罚内容'
:
cf_nr
,
'罚款金额(万元)'
:
cf_nr_fk
,
'没收违法所得、没收非法财物的金额(万元)'
:
cf_nr_wfff
,
'暂扣或吊销证照名称及编号'
:
cf_nr_zkdx
,
'违法行为类型'
:
cf_wfxw
,
'违法事实'
:
cf_sy
,
'处罚依据'
:
cf_yj
,
'处罚机关'
:
cf_cfjg
,
'处罚机关统一社会信用代码'
:
cf_cfjgdm
,
'数据来源'
:
cf_sjly
,
'数据来源单位统一社会信用代码'
:
cf_sjlydm
}
list_punish
.
append
(
dic_punish
)
return
list_punish
# 经营异常
def abnormal(headers, com_name, social_code):
    """Collect the abnormal-operation records of one company from Credit China.

    Args:
        headers: HTTP headers handed through to ``getRequest``.
        com_name: Company name, used as the search keyword.
        social_code: Unified social credit code of the company.

    Returns:
        A list with one dict per record, keyed by the Chinese field labels.
        The list is empty when the site reports no records.
    """
    list_abhormal = []
    query_url = (
        'https://public.creditchina.gov.cn/private-api/catalogSearch'
        '?tableName=credit_scjdglzj_fr_ycjyml&searchState=1&scenes=defaultscenario'
        f'&keyword={com_name}&tyshxydm={social_code}'
    )

    # The first request is only used to read the reported total.
    json_data = getRequest(f'{query_url}&page=1&pageSize=10', headers)
    total_size = json_data['data']['totalSize']
    if total_size <= 0:
        # Previously log.info() was called without a message, which raises TypeError.
        log.info(f'该企业{com_name}无经营异常信息')
        return list_abhormal

    # total_size is an int, so it must be wrapped in range(); the bound mirrors punish().
    for page in range(1, total_size + 1):
        json_data = getRequest(f'{query_url}&page={page}&pageSize=10', headers)
        info_list = json_data['data']['list']
        if not info_list:
            # An empty page means there is nothing further to read.
            break
        for entity in info_list:
            dic_abnormal = {
                '企业名称': entity['entname'],
                '社会统一信用代码': entity['uniscid'],
                '法定代表人': entity['lerep'],
                '主体身份代码': entity['pripid'],
                '注册号': entity['regno'],
                '列入经营异常名录原因类型名称': entity['specausename'],
                '设立日期': entity['abntime'],
                '列入决定机关名称': entity['decorgname'],
                '数据来源': entity['dataSource'],
            }
            list_abhormal.append(dic_abnormal)
    return list_abhormal
if
__name__
==
'__main__'
:
...
...
@@ -154,16 +257,18 @@ if __name__=='__main__':
'sec-ch-ua-mobile'
:
'?0'
,
'sec-ch-ua-platform'
:
'"Windows"'
}
type_list
=
[
'严重失信主体名单'
,
'行政管理'
]
com_name
=
''
social_code
=
''
dishonesty
()
punish
()
com_name
=
'石家庄交投集团工程服务有限责任公司'
social_code
=
'91130100MA7EK14C8L'
# list_dishonesty = dishonesty(headers,com_name,social_code)
# print(list_dishonesty)
list_punish
=
punish
(
headers
,
com_name
,
social_code
)
print
(
list_punish
)
# abnormal(headers,com_name,social_code)
# 报告链接
url_report
=
f
'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
report_json
=
getRequest
(
url_report
,
headers
)
reportNumber
=
report_json
[
'data'
][
'reportNumber'
]
pdf_url
=
f
'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
#
url_report = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownload?companyName={com_name}&entityType=1&uuid=&tyshxydm={social_code}'
#
report_json = getRequest(url_report, headers)
#
reportNumber = report_json['data']['reportNumber']
#
pdf_url = f'https://public.creditchina.gov.cn/credit-check/pdf/clickDownloadOBS?reportNumber={reportNumber}'
# respon = requests.get(url=pdf_url,headers=headers,verify=False,timeout=30)
...
...
comData/tyctest/tycdt.py
浏览文件 @
4e84d611
...
...
@@ -58,8 +58,8 @@ class Tycdt(object):
def
doJob
(
self
):
while
True
:
# 根据从Redis中拿到的社会信用代码,在数据库中获取对应基本信息
#
social_code = self.baseCore.redicPullData('NewsEnterprise:gnqybc_socialCode')
social_code
=
'913205002517479347'
social_code
=
self
.
baseCore
.
redicPullData
(
'NewsEnterprise:gnqybc_socialCode'
)
#
social_code = '913205002517479347'
# 判断 如果Redis中已经没有数据,则等待
if
social_code
==
None
:
time
.
sleep
(
20
)
...
...
comData/weixin_solo/get_tokenCookies.py
浏览文件 @
4e84d611
...
...
@@ -50,7 +50,7 @@ if __name__=="__main__":
opt
.
add_experimental_option
(
"excludeSwitches"
,
[
"enable-automation"
])
opt
.
add_experimental_option
(
'excludeSwitches'
,
[
'enable-logging'
])
opt
.
add_experimental_option
(
'useAutomationExtension'
,
False
)
opt
.
binary_location
=
r'D:\
crawler\baidu_crawler\tool\
Google\Chrome\Application\chrome.exe'
opt
.
binary_location
=
r'D:\Google\Chrome\Application\chrome.exe'
chromedriver
=
r'D:\cmd100\chromedriver.exe'
browser
=
webdriver
.
Chrome
(
chrome_options
=
opt
,
executable_path
=
chromedriver
)
url
=
"https://mp.weixin.qq.com/"
...
...
qiushi_leaderspeech.py
0 → 100644
浏览文件 @
4e84d611
import
datetime
import
json
import
time
import
redis
import
requests
from
bs4
import
BeautifulSoup
from
urllib.parse
import
urljoin
from
kafka
import
KafkaProducer
from
base.BaseCore
import
BaseCore
# Project helper object; the logger obtained from it is used throughout this file.
baseCore = BaseCore()
log = baseCore.getLogger()
# Redis connection; the __main__ block uses it (sismember/sadd) to skip links already sent.
# NOTE(review): host and password are hard-coded — consider loading them from configuration.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
    """Publish one news record to the ``crawlerInfo`` Kafka topic.

    Args:
        dic_news: JSON-serialisable dict describing the article.

    Returns:
        True when the broker acknowledged the message within 10 seconds,
        False when anything went wrong (the error is logged, not raised).
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024 * 1024 * 20)
        payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("crawlerInfo", payload)
        # Block until the send is acknowledged so failures surface here.
        print(kafka_result.get(timeout=10))
        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
    finally:
        # A producer is created per call, so it must be closed to release
        # its sockets and background thread.
        if producer is not None:
            producer.close(timeout=10)
def getRequest(url, headers):
    """Fetch ``url`` with a 30 second timeout and return the parsed page.

    The HTTP status code is not acted upon: the response body is parsed
    with the 'html.parser' backend whatever the status was.
    """
    response = requests.get(url=url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')
def deletep(soup, attribute_to_delete, value_to_delete):
    """Remove, in place, every <p> tag whose given attribute has the given value."""
    attr_filter = {attribute_to_delete: value_to_delete}
    for paragraph in soup.find_all('p', attr_filter):
        paragraph.decompose()
def deletek(soup):
    """Remove blank tags from ``soup`` in place.

    A tag is a candidate when its text is empty and it is not an img, video
    or br tag, or when its text is exactly one space. A candidate is kept
    if any of its descendants is an img, video or br tag.
    """
    media_tags = ["img", "video", "br"]

    def _is_candidate(tag):
        text = tag.get_text()
        if len(text) == 0 and tag.name not in media_tags:
            return True
        return text == ' '

    for candidate in soup.find_all(_is_candidate):
        # Same outcome as a for/else that breaks on the first media descendant.
        holds_media = any(node.name in media_tags for node in candidate.descendants)
        if not holds_media:
            candidate.decompose()
# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    """Rewrite relative link targets in ``html`` as absolute URLs.

    Args:
        html: Markup as a string, or an already parsed document.
        listurl: Base URL that relative addresses are resolved against.

    Returns:
        The parsed document, with each <a>/<img> tag's ``href`` (or, when the
        tag has no ``href``, its ``src``) joined onto ``listurl``.
    """
    soup = BeautifulSoup(html, 'html.parser') if isinstance(html, str) else html
    for tag in soup.find_all(['a', 'img']):
        attrs = tag.attrs
        if 'href' in attrs:
            key = 'href'
        elif 'src' in attrs:
            key = 'src'
        else:
            continue
        tag[key] = urljoin(listurl, tag[key])
    return soup
# Script entry point: walks the qstheory.cn index page -> each book (year) page -> each
# issue page -> each article, and sends every accepted article to Kafka.
if __name__ == '__main__':
    # Request headers sent with every page fetch below.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Cookie': 'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host': 'www.qstheory.cn',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    soup_report = getRequest(url, headers)
    # Each 'col-sm-3' div on the index page is treated as one book entry.
    report_list = soup_report.find_all('div', class_='col-sm-3')
    for book in report_list:
        href = book.find('div', class_='booktitle').find('a')['href']
        # NOTE(review): year is read here but not used again in this block.
        year = book.find('div', class_='booktitle').find('a').text
        soup_href = getRequest(href, headers)
        period = soup_href.find('div', class_='highlight')
        # Drop centred paragraphs and blank tags before listing the issue links.
        deletep(period, 'align', 'center')
        deletek(period)
        period_list = period.find_all('p')
        for p in period_list:
            period_href = p.find('a')['href']
            period_title = p.find('a').text
            soup_news = getRequest(period_href, headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            # The first <p> under the 'highlight' div is skipped.
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                # Any failure while reading an entry's author/link/title skips that entry.
                try:
                    deletek(new)
                    try:
                        author = new.find('font', face='楷体').text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    except:
                        continue
                    # Entries whose author text is longer than 4 characters are skipped.
                    if len(author) > 4:
                        continue
                    # if '(' in author or '本刊' in author or '国家' in author\
                    #         or '中共' in author or '记者' in author or '新闻社' in author\
                    #         or '党委' in author or '调研组' in author or '研究中心' in author\
                    #         or '委员会' in author or '博物' in author or '大学' in author or '联合会' in author :
                    # Skip entries whose author text contains any of these markers.
                    if '(' in author or '本刊' in author or '国家' in author \
                            or '中共' in author or '记者' in author or '新闻社' in author \
                            or '党委' in author or '”' in author \
                            or '大学' in author or '洛桑江村' in author:
                        continue
                    new_href = new.find('a')['href']
                    # Skip links already recorded in the Redis set for this issue.
                    is_member = r.sismember('qiushileaderspeech::' + period_title, new_href)
                    if is_member:
                        continue
                    new_title = new.find('a').text.replace('\u3000', ' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except:
                    continue
                soup_new = getRequest(new_href, headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                # Pages without an 'inner' div are skipped.
                if result:
                    pass
                else:
                    continue
                span_list = result.find_all('span')
                # The first span supplies the origin and the third the publish time.
                source = span_list[0].text.replace('来源:', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                # Plain text is taken before the links are rewritten to absolute URLs.
                content = soup_new.find('div', class_='highlight').text
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                # log.info(dic_news)
                # Record the link in Redis only after Kafka accepted the message.
                if sendKafka(dic_news):
                    r.sadd('qiushileaderspeech::' + period_title, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论