提交 5880af84 作者: liuweigang

微信修改项目提交

上级 5131584c
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile default="true" name="Default" enabled="true" />
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="crawler_2022" />
<module name="weixinCrawler" />
</profile>
</annotationProcessing>
</component>
<component name="JavacSettings">
<option name="ADDITIONAL_OPTIONS_OVERRIDE">
<module name="weixinCrawler" options="-parameters" />
</option>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" defaultCharsetForPropertiesFiles="UTF-8">
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/awx/controller/WeixinController.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/awx/service/SiteService.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/entity/SiteMsgRecord.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/test/Test.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/util/WeixinUtil.java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/weixinCrawler/src/main/resources/constants.properties" charset="UTF-8" />
<file url="PROJECT" charset="UTF-8" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="http://maven.aliyun.com/nexus/content/groups/public/" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>
\ No newline at end of file
<component name="libraryTable">
<library name="mysql-connector-java-5.1.7-bin">
<CLASSES>
<root url="jar://$PROJECT_DIR$/weixinCrawler/lib/mysql-connector-java-5.1.7-bin.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
</library>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="FrameworkDetectionExcludesConfiguration">
<file type="web" url="file://$PROJECT_DIR$" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
<option value="$PROJECT_DIR$/weixinCrawler/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_1_8" default="true" project-jdk-name="1.8" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
<component name="ProjectType">
<option name="id" value="jpab" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RunConfigurationProducerService">
<option name="ignoredProducers">
<set>
<option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
</set>
</option>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.zzsn</groupId>
<artifactId>crawler_2022</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>war</packaging>
<name>crawler_2022</name>
<!-- FIXME change it to the project's website -->
<url>http://www.zzsn.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<!-- <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>-->
</dependencies>
<build>
<finalName>crawler_2022</finalName>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-war-plugin</artifactId>
<version>3.2.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar
微信公众号爬取实现原理:
1.配置公众号对应的信息
2.使用代码与企业微信建立连接,并将对应的公众号链接通过代码发送到对应的微信号上。
3.使用自动化软件在手机微信上触发“接收消息并点击访问链接”的操作,同时使用电脑上的fiddler抓取对应的页面信息。
4.使用代码从页面中提取微信公众号中的信息链接。
5.对抽取到的信息链接进行访问和信息抽取。
微信爬虫流程:
1.从kafka获取微信公众号信息。
2.从链接中获取微信公众号id。
3.使用代码模拟企业微信并将信息发送给相关的微信号。
4.手机微信号接收到公众号的链接信息。
5.使用免root自动化助手创建自动化脚本模拟点击手机微信消息操作。
6.使用fiddler抓取手机微信点击公众号链接后打开的页面,并通过fiddler的脚本将抓取的页面信息发送给程序代码。
7.代码实现对页面信息的抽取解析获取对应的资讯链接地址,并发送到kafka中。
8.接收kafka中的资讯链接地址并对相关的内容信息进行抽取,再发送到相应的kafka的topic中。
<?xml version="1.0" encoding="UTF-8"?>
<web-ext-pme
xmlns="http://websphere.ibm.com/xml/ns/javaee"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://websphere.ibm.com/xml/ns/javaee http://websphere.ibm.com/xml/ns/javaee/ibm-web-ext-pme_1_0.xsd"
version="1.0">
</web-ext-pme>
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM https://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Maven Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM set title of command window
title %0
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %*
if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %*
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B
)
@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
@REM This allows using the maven wrapper in projects that prohibit checking in binary data.
if exist %WRAPPER_JAR% (
if "%MVNW_VERBOSE%" == "true" (
echo Found %WRAPPER_JAR%
)
) else (
if not "%MVNW_REPOURL%" == "" (
SET DOWNLOAD_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
)
if "%MVNW_VERBOSE%" == "true" (
echo Couldn't find %WRAPPER_JAR%, downloading it ...
echo Downloading from: %DOWNLOAD_URL%
)
powershell -Command "&{"^
"$webclient = new-object System.Net.WebClient;"^
"if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
"$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
"}"^
"[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^
"}"
if "%MVNW_VERBOSE%" == "true" (
echo Finished downloading %WRAPPER_JAR%
)
)
@REM End of extension
@REM Provide a "standardized" way to retrieve the CLI args that will
@REM work with both Windows and non-Windows executions.
set MAVEN_CMD_LINE_ARGS=%*
%MAVEN_JAVA_EXE% ^
%JVM_CONFIG_MAVEN_PROPS% ^
%MAVEN_OPTS% ^
%MAVEN_DEBUG_OPTS% ^
-classpath %WRAPPER_JAR% ^
"-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^
%WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat"
if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%"=="on" pause
if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE%
cmd /C exit /B %ERROR_CODE%
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.9.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.zzsn</groupId>
<artifactId>weixinCrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>weixinCrawler</name>
<description>weixinCrawler</description>
<properties>
<failOnMissingWebXml>false</failOnMissingWebXml>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Jackson 1.x bundle resolved from the module's own lib/ directory (system scope).
     ${project.basedir} replaces the deprecated ${pom.basedir} spelling. -->
<dependency>
    <groupId>jackson-all</groupId>
    <artifactId>jackson-all</artifactId>
    <version>1.7.6</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/lib/jackson-all-1.7.6.jar</systemPath>
</dependency>
<!-- Jedis client resolved from the module's own lib/ directory (system scope).
     ${project.basedir} replaces the deprecated ${pom.basedir} spelling. -->
<dependency>
    <groupId>jedis</groupId>
    <artifactId>jedis</artifactId>
    <version>3.0.1</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/lib/jedis-3.0.1.jar</systemPath>
</dependency>
<!-- xml解析 -->
<dependency>
<groupId>jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
</dependency>
<!---->
<dependency>
<groupId>io.protostuff</groupId>
<artifactId>protostuff-core</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>io.protostuff</groupId>
<artifactId>protostuff-runtime</artifactId>
<version>1.6.0</version>
</dependency>
<!-- http 工具 -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.2.0</version>
</dependency>
<dependency>
<groupId>com.burgstaller</groupId>
<artifactId>okhttp-digest</artifactId>
<version>1.15</version>
</dependency>
<!-- mybatis-plus -->
<!--<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>3.4.1</version>
</dependency>-->
<!-- 数据库连接池 -->
<!--<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>-->
<!--redis依赖-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.13</version>
</dependency>
<!-- kafka依赖添加 -->
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka</artifactId>
<!-- <version>2.1.0.RELEASE</version>-->
</dependency>
<!--xpath解析-->
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.2</version>
</dependency>
<dependency>
<groupId>com.whalin</groupId>
<artifactId>Memcached-Java-Client</artifactId>
<version>3.0.2</version>
</dependency>
<dependency>
<groupId>net.spy</groupId>
<artifactId>spymemcached</artifactId>
<version>2.12.2</version>
</dependency>
<dependency>
<groupId>com.googlecode.xmemcached</groupId>
<artifactId>xmemcached</artifactId>
<version>2.4.7</version>
</dependency>
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.5.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
<!-- spring定时任务 -->
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>2.2.1</version>
</dependency>
<!-- 该依赖必加,里面有sping对schedule的支持 -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.3.8</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- Web starter. No explicit version: it is managed by spring-boot-starter-parent, which keeps it
     aligned with the other Spring Boot modules. The former RELEASE meta-version resolved to whatever
     was newest in the repository at build time. -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
    <scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>com.zzsn.WeixinCrawlerApplication</mainClass>
<includeSystemScope>true</includeSystemScope><!--外部进行打包-->
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
<executions>
<execution>
<goals>
<goal>repackage</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package com.zzsn;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
//@SpringBootApplication
/**
 * Application entry point for the WeChat crawler.
 *
 * <p>Component scanning starts at the {@code com.zzsn} package. The class can be launched
 * through {@link #main(String[])} or bootstrapped through {@link #configure(SpringApplicationBuilder)},
 * the hook inherited from {@link SpringBootServletInitializer}.
 */
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class WeixinCrawlerApplication extends SpringBootServletInitializer {

    /**
     * Starts the application from the command line.
     *
     * @param args command-line arguments handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(WeixinCrawlerApplication.class, args);
    }

    /**
     * Registers this class as the configuration source on the supplied builder.
     *
     * @param application builder supplied by the servlet initializer
     * @return the same builder with this class added as a source
     */
    @Override
    protected SpringApplicationBuilder configure(SpringApplicationBuilder application) {
        return application.sources(WeixinCrawlerApplication.class);
    }
}
package com.zzsn.awx.controller;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSON;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.WeixinDetailThread;
import com.zzsn.entity.SiteMsgRecord;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.entity.Wxurl;
import com.zzsn.job.JedisUtil;
import com.zzsn.awx.service.ApiService;
import com.zzsn.awx.service.SiteService;
import com.zzsn.util.Constants;
import com.zzsn.util.WeixinUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Controller that receives the callbacks of the Fiddler capture script (mapped under /wxt).
 * Author: 李东亮
 * Created: 2015-5-7 18:52:37
 * Company: 郑州数能软件科技有限公司
 * @version 1.0
 *
 */
@Slf4j
@Controller
@RequestMapping("/wxt")
public class WeixinController {
// http://localhost:8079/wxt/dofiddlerback?wxurl=1
/**
 * Connectivity check: answers a GET on {@code /test} with a fixed text body.
 *
 * @return the literal string {@code hello!}
 */
@RequestMapping(value = "/test", method = RequestMethod.GET)
@ResponseBody
public String test() {
    final String greeting = "hello!";
    return greeting;
}
/** Matches one escaped article link inside the captured page text; compiled once and reused. */
private static final Pattern WECHAT_ARTICLE_URL = Pattern.compile("http:.{200,300}#wechat_redirect");

/**
 * Handles the page capture posted back by the Fiddler script.
 *
 * <p>Every article link found in {@code weixinxml} is un-escaped and crawled through a
 * {@link WeixinDetailThread}, using the source configuration cached in Redis under
 * {@code ":" + id}. When at least one article was crawled successfully, a
 * {@link SiteMsgRecord} with the success count is sent to {@code Constants.KAFKA_COLLECT_TOPIC}.
 *
 * @param wxurl     URL reported by the capture script; only used for diagnostics here
 * @param weixinxml captured page text that is scanned for article links; may be null or empty
 * @return always {@code "ok"}
 * @throws Exception whatever the crawler, Redis or JSON parsing raises
 */
@RequestMapping("dofiddlerback")
public @ResponseBody String doFiddlerback(String wxurl,String weixinxml) throws Exception
{
    KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);

    // Nothing to scan: answer normally instead of failing with a NullPointerException.
    if (StringUtils.isEmpty(weixinxml)) {
        log.warn("dofiddlerback called without page content, wxurl={}", wxurl);
        return "ok";
    }

    Matcher m = WECHAT_ARTICLE_URL.matcher(weixinxml);
    int count = 0;
    // Both values end up describing the last link that was processed.
    Date collectTime = null;
    String infoSourceId = "";
    while (m.find()) {
        // Drop the escaping backslashes and the "amp;" residue of escaped ampersands.
        String weixinurl = m.group(0).replaceAll("\\\\","").replaceAll("amp;","");

        WeixinDetailThread weixinDetailThread = new WeixinDetailThread();
        String weixinid = weixinDetailThread.getParam(weixinurl);
        String siteStr = JedisUtil.getString(":" + weixinid);
        SiteMsgTemple siteMsgTemple =
                StringUtils.isEmpty(siteStr) ? null : JSON.parseObject(siteStr, SiteMsgTemple.class);
        if (siteMsgTemple == null) {
            // Without the cached configuration the link cannot be attributed to a source;
            // skip it so the remaining links of this capture are still processed.
            log.warn("No cached source configuration under key :{}, skipping url:{}", weixinid, weixinurl);
            continue;
        }
        siteMsgTemple.setSiteUri(weixinurl);
        weixinDetailThread.siteMsgTemple = siteMsgTemple;
        collectTime = DateTime.now();
        infoSourceId = siteMsgTemple.getId();

        // Download the article behind the link; the crawler reports success through its return value.
        boolean flag = weixinDetailThread.detailCrawler();
        if (flag) {
            count++;
        }
        log.info("提取公众号详情信息url:{}", weixinurl);
    }

    if (count > 0) {
        ObjectMapper mapper = new ObjectMapper();
        try {
            SiteMsgRecord siteMsgRecord = new SiteMsgRecord();
            siteMsgRecord.setInfoSourceId(infoSourceId);
            siteMsgRecord.setNum(count);
            siteMsgRecord.setSource("1");
            siteMsgRecord.setCollectTime(collectTime);
            String docjson = mapper.writeValueAsString(siteMsgRecord);
            log.info("{}", docjson);
            kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
            log.info("发送到kafka成功。");
        } catch (JsonProcessingException e) {
            // Keep the request successful, but record the cause so the failure can be diagnosed.
            log.error("发送到kafka失败。", e);
        }
    }
    return "ok";
}
/**
 * Stores the URL captured by Fiddler in Redis, keyed by the original URL.
 *
 * <p>The {@code signature} query parameter of {@code oldurl} is used as a Redis key. The mapping
 * {@code oldurl -> wxurl} is written only when a value exists under that key, a value exists
 * under {@code "apiflag" + key}, and {@code wxurl} is not empty.
 *
 * <p>The previous implementation also built a {@code Wxurl} object that was never used; parsing
 * the id for it could throw {@link NumberFormatException}. That dead object has been removed.
 *
 * @param wxurl  URL captured by Fiddler
 * @param oldurl original URL carrying the {@code signature} parameter
 * @return always {@code "123"}
 * @throws Exception whatever the parameter extraction or Redis access raises
 */
@RequestMapping("dofiddlerbackurl")
public @ResponseBody String doFiddlerbackforurl(String wxurl,String oldurl) throws Exception
{
    log.info("dofiddlerbackurl wxurl={}, oldurl={}", wxurl, oldurl);

    String keyid = SiteService.getParambyname(oldurl, "signature");
    String wxid = JedisUtil.getString(keyid);
    String wxidflag = JedisUtil.getString("apiflag" + keyid);

    if (StringUtils.isNotEmpty(wxid) && StringUtils.isNotEmpty(wxidflag) && StringUtils.isNotEmpty(wxurl)) {
        // NOTE(review): the third argument is presumably an expiry, with 0 meaning "none" — confirm in JedisUtil.
        JedisUtil.setString(oldurl, wxurl, 0);
    }
    return "123";
}
/**
 * Experimental variant of the Fiddler URL callback.
 *
 * <p>Prints both arguments and looks up the id cached under the {@code signature} parameter of
 * {@code oldurl}; the lookup result is currently unused because the persistence call is
 * commented out.
 *
 * @param wxurl  the resolved URL reported by the proxy
 * @param oldurl the original URL that carries the {@code signature} parameter
 * @return always the literal {@code "123"}
 * @throws Exception propagated from the helpers
 */
@RequestMapping("dofiddlerbackurl111")
public @ResponseBody String doFiddlerbackforurl111(String wxurl,String oldurl) throws Exception
{
    System.out.println(wxurl);
    System.out.println(oldurl);

    final String signature = SiteService.getParambyname(oldurl, "signature");
    // The Redis read is kept even though its value is unused, so failures still surface.
    final String cachedId = JedisUtil.getString(signature);
    // siteService.updateBasedata(cachedId, wxurl);
    return "123";
}
/**
 * Manual scratch entry point: extracts article URLs from a captured HTML fragment with a regex,
 * prints each hit (raw, cleaned and its length) and then sends one test message through
 * {@code WeixinUtil.sendWxMessage}.
 *
 * <p>NOTE(review): this sends a real message with credentials embedded in source; it should not
 * be run unintentionally.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String html="etime&quot;:1562742531,&quot;fakeid&quot;:&quot;3008169006&quot;,&quot;status&quot;:2,&quot;content&quot;:&quot;&quot;},&quot;app_msg_ext_info&quot;:{&quot;title&quot;:&quot;孩子,爸爸没有双手,依然可以抱你长大!&quot;,&quot;digest&quot;:&quot;一场意外事故,让他在13岁时失去了双臂。如今他照顾着九个月大的儿子。&quot;,&quot;content&quot;:&quot;&quot;,&quot;fileid&quot;:504808950,&quot;content_url&quot;:&quot;http:\\/\\/mp.weixin.qq.com\\/s?__biz=MzAwODE2OTAwNg==&amp;amp;mid=2652292603&amp;amp;idx=1&amp;amp;sn=7f49e032fab2b1df15ca01a4add8e968&amp;amp;chksm=809096bab7e71fac1bec47b17d37314c0948dd1ed61aea09abaa823f498f17e8511ce5bb08f2&amp;amp;scene=27#wechat_redirect&quot;,&quot;source_url&quot;:&quot;https:\\/\\/sina.cn\\/&quot;,&quot;cover&quot;:&quot;http:\\/\\/mmbiz.qpic.cn\\/mmbiz_jpg\\/x6iaHWKibUzk0J1dOiccqkceSyM6n6SngTicJyaUo7N7zGAtz7pzJOZ8PibUyibgEvQKIWzV5I0yLAiaeHPrLTEU9nW6A\\/0?wx_fmt=jpeg&quot;,&quot;subtype&quot;:9,&quot;is_multi&quot;:1,&quot;multi_app_msg_item_list&quot;:[{&quot;title&quot;:&quot;捉谣记&nbsp;|&nbsp;官方公布酒驾玛莎拉蒂女车主有间歇性精神病?消息不靠谱&quot;,&quot;digest&quot;:&quot;目前并没有任何在调查和处置该案的“官方”公布酒驾女车主有间歇性精神病。&quot;,&quot;content&quot;:&quot;&quot;,&quot;fileid&quot;:504808954,&quot;content_url&quot;:&quot;http:\\/\\/mp.weixin.qq.com\\/s?__biz=MzAwODE2OTAwNg==&amp;amp;mid=2652292603&amp;amp;idx=2&amp;amp;sn=ef7a35347bc9dceb79c0fa42f8840f79&amp;amp;chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&amp;amp;scene=27#wechat_redirect&quot;,&quot;source_url&quot;:&quot;https:\\/\\/sina.cn\\/&quot;,&quot;cover&quot;:&quot;http:\\/\\/mmbiz.qpic.cn\\/mmbiz_jpg\\/x6iaHWKibUzk0J1dOiccqkceSyM6n6SngTicPeCnYPoIzGSvXISGLQFdRIkV0ZrOODuczz0bxCFXXqfTIv6IOqVPtg\\/0?";

    // Article links run from "http:" to "#wechat_redirect" with 200-300 characters in between.
    Matcher linkMatcher = Pattern.compile("http:.{200,300}#wechat_redirect").matcher(html);
    while (linkMatcher.find()) {
        String hit = linkMatcher.group(0);
        System.out.println(1);
        System.out.println(hit);
        // Strip the JSON backslash escapes and the doubled "amp;" entity residue.
        System.out.println(hit.replaceAll("\\\\","").replaceAll("amp;",""));
        System.out.println(hit.length());
    }

    WeixinUtil.sendWxMessage("lwg2468741258", "点击链接:"+"http://mp.weixin.qq.com/s?__biz=MzAwODE2OTAwNg==&mid=2652292603&idx=2&sn=ef7a35347bc9dceb79c0fa42f8840f79&chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&scene=27#wechat_redirect", 1000002, "ww6bef1e81aacbf27a",
            "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
}
/**
 * Fiddler callback for the app channel: scans the captured payload for Toutiao mobile links and
 * prints each one that would be queued.
 *
 * <p>The duplicate check and the MQ hand-off are both disabled in this version (their calls were
 * commented out), so every match is reported as queued. The former "already crawled" branch could
 * never run and has been removed together with the unused message object.
 *
 * @param wxurl     the URL reported by the proxy (only printed)
 * @param weixinxml the captured payload to scan; {@code null} or empty yields no matches
 * @return always the literal {@code "123"}
 * @throws Exception kept for interface compatibility
 */
@RequestMapping("dofiddlerbackapp")
public @ResponseBody String doFiddlerbackapp(String wxurl,String weixinxml) throws Exception
{
    System.out.println(wxurl);
    System.out.println(weixinxml);

    // Pattern.matcher(null) throws NullPointerException; a missing payload simply has no links.
    if (weixinxml == null || weixinxml.isEmpty()) {
        return "123";
    }

    // NOTE(review): the dots are unescaped and the match includes the trailing quote character;
    // the pattern is kept unchanged so the set of matches stays the same.
    String patt = "http://m.toutiao.com.{10,30}\"";
    Matcher m = Pattern.compile(patt).matcher(weixinxml);
    while (m.find()) {
        // Strip backslash escapes and "amp;" entity residue from the captured link.
        String weixinurl = m.group(0).replaceAll("\\\\","").replaceAll("amp;","");
        // Duplicate detection is disabled: String keyanv=MemcachedFactory.getKeyStr(weixinurl);
        // MQ hand-off is disabled: mqSender.sendSite(site);
        System.out.println("录入mq"+weixinurl);
    }
    return "123";
}
}
package com.zzsn.awx.service;
import com.zzsn.entity.Wxurl;
import com.zzsn.job.JedisUtil;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler API service: brokers the conversion of WeChat URLs.
 * Author: Li Dongliang
 * Created: 2016-04-13 14:52:20
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 */
@Service
public class ApiService {
    @Autowired
    private SiteService siteService;

    /** Single-request lock: {@code true} means free, {@code false} means a request is in flight. */
    public Boolean suo_flag=true;

    /** Prefix of the Redis marker key written next to each signature key. */
    public static String apiflag="apiflag";

    /**
     * Tries to obtain the converted URL for {@code oldurl}, allowing only one caller at a time.
     *
     * <p>The persistence lookup is currently disabled (the list polled below is always empty), so
     * in practice this returns {@code "0"}. The lock is now released on every exit path; the
     * previous version left it held forever when a converted URL was returned.
     *
     * @param oldurl the original URL to convert
     * @return the converted URL, or {@code "0"} when busy, not converted, or on error
     */
    public String getNurl(String oldurl){
        if (!tryAcquire()) {
            return "0";
        }
        try {
            Wxurl url=new Wxurl();
            url.setFlag(0L);
            url.setOurl(oldurl);
            // Poll up to 15 times for a completed conversion.
            for (int i = 0; i < 15; i++) {
                // Placeholder for the removed DAO query; stays empty until persistence is restored.
                List<Wxurl> list1=new ArrayList<>();
                if (list1.size() > 0) {
                    Long flag = list1.get(0).getFlag();
                    String nurl = list1.get(0).getNurl();
                    if (flag != null && flag.longValue() == 1 && StringUtils.isNotEmpty(nurl)) {
                        return nurl;
                    }
                    Thread.sleep(1000L);
                    System.out.println(i);
                } else {
                    // wxurlDao.save(url);
                }
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            release();
        }
        return "0";
    }

    /** Atomically takes the single-request lock; returns false when it is already held. */
    private synchronized boolean tryAcquire() {
        if (!Boolean.TRUE.equals(suo_flag)) {
            return false;
        }
        suo_flag = false;
        return true;
    }

    /** Releases the single-request lock. */
    private synchronized void release() {
        suo_flag = true;
    }

    /**
     * Starts a background thread that runs one conversion pass every five seconds until it is
     * interrupted.
     *
     * <p>NOTE(review): every call starts another never-ending thread and the {@code wxurl}
     * argument is not used by the worker; both are kept as in the original.
     *
     * @param wxurl unused
     */
    public void zhuanhuanurl(String wxurl){
        Thread t = new Thread(new Runnable(){
            public void run(){
                while (!Thread.currentThread().isInterrupted()) {
                    try {
                        convertPending();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    // Sleep outside the try so a failing pass cannot turn into a busy loop.
                    try {
                        Thread.sleep(5*1000);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        });
        t.start();
    }

    /**
     * Runs one conversion pass: skips an already converted record, otherwise caches its id under
     * the URL signature and sends the URL out for conversion.
     */
    private void convertPending() {
        // Placeholder for the removed DAO query: a blank record stands in for the pending row.
        Wxurl b = new Wxurl();

        // Already converted records are not processed again (null-safe flag comparison).
        Long flag = b.getFlag();
        if (flag != null && flag.longValue() == 1L) {
            System.out.println("meiyouxin_url");
            return;
        }

        String souceid = SiteService.getParambyname(b.getOurl(), "signature");
        // Only touch Redis when the URL actually carries a signature to key on.
        if (StringUtils.isNotEmpty(souceid)) {
            JedisUtil.setString(souceid, b.getId()+"", 600);
            JedisUtil.setString(apiflag+souceid, b.getId()+"", 600);
            siteService.sendurl(b.getOurl());
        } else {
            System.out.println("wentiurl"+b.getOurl());
        }
    }
}
\ No newline at end of file
package com.zzsn.awx.service;
import com.alibaba.fastjson.JSON;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.DateUtil;
import com.zzsn.util.WeixinUtil;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.util.*;
/**
* 爬虫service
* 创建人:李东亮
* 创建时间:2016-4-13 下午2:52:20
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Service
public class SiteService {
private static final Logger Log = LoggerFactory.getLogger(SiteService.class);
private static Long id=0L;
/**
 * Caches the site definition in Redis and pushes its URL to WeChat.
 *
 * <p>Steps: serialise {@code siteMsgTemple} to JSON and store it under {@code ":" + __biz},
 * delete the plain {@code __biz} key when one was found, send the URL through
 * {@link #sendurl(String)}, then pause for 30 seconds.
 *
 * @param siteMsgTemple the site whose URI is sent; must not be {@code null}
 */
public void sendUrlToweixin(SiteMsgTemple siteMsgTemple){
    final String targetUrl = siteMsgTemple.getSiteUri();

    String bizId = "";
    try {
        bizId = getParam(targetUrl);
        // Cache the site definition so it can be looked up later by its __biz id.
        String payload = JSON.toJSONString(siteMsgTemple);
        JedisUtil.setString(":" + bizId, payload, 0);
    } catch (Exception e) {
        e.printStackTrace();
    }

    boolean hasBizId = bizId != null && bizId.trim().length() > 0;
    if (hasBizId) {
        clearweixinhaoid(bizId);
    }

    // sendurl itself waits after sending; the pause below adds another 30 seconds.
    sendurl(targetUrl);
    try {
        Thread.sleep(1000*30);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Sends {@code url} as a WeChat message to the configured recipient and then throttles.
 *
 * <p>After a successful send the method waits 30 seconds; after a failure it prints the stack
 * trace and waits 20 seconds. An interrupt during either wait now restores the thread's interrupt
 * flag and returns immediately, instead of being swallowed and followed by another sleep.
 *
 * <p>NOTE(review): the numeric id and the two string credentials are hard-coded here; they look
 * like values that belong in configuration.
 *
 * @param url the link to send
 */
public synchronized void sendurl (String url){
    try {
        WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        Thread.sleep(30*1000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        e.printStackTrace();
        // Back off before the next attempt by the caller.
        try {
            Thread.sleep(20*1000);
        } catch (InterruptedException e1) {
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Sends {@code url} as a WeChat message to {@code wxname} and then throttles for 20 seconds.
 *
 * <p>A failed send prints the stack trace and waits 20 seconds as well. An interrupt during
 * either wait now restores the thread's interrupt flag and returns immediately, instead of being
 * swallowed and followed by another sleep.
 *
 * <p>NOTE(review): the numeric id and the two string credentials are hard-coded here; they look
 * like values that belong in configuration.
 *
 * @param url    the link to send
 * @param wxname the recipient account name
 */
public synchronized void sendurl1 (String url,String wxname){
    try {
        WeixinUtil.sendWxMessage(wxname, "点击链接:"+url, 1000002, "ww6bef1e81aacbf27a",
                "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        Thread.sleep(20*1000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    } catch (Exception e) {
        e.printStackTrace();
        // Back off before the next attempt by the caller.
        try {
            Thread.sleep(20*1000);
        } catch (InterruptedException e1) {
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Deletes the Redis entry stored under {@code weixinhaoid}, ignoring any failure.
 *
 * @param weixinhaoid the Redis key to delete
 */
public static void clearweixinhaoid(String weixinhaoid){
    try {
        JedisUtil.del(weixinhaoid);
    } catch (Exception ignored) {
        // Best effort: a failed delete must not interrupt the send flow.
    }
}
/**
 * Splits the query string of {@code url} into a name-to-value map.
 *
 * <p>Only the text between the first and the second {@code '?'} is considered. Pairs are split on
 * {@code '&'} and each pair on {@code '='}; the value is the text between the first and the
 * second {@code '='}, so trailing {@code '='} padding is dropped
 * (e.g. {@code __biz=MzAw==} yields {@code "MzAw"}). This legacy behaviour is preserved because
 * callers derive cache keys from it.
 *
 * <p>Inputs that used to raise {@code ArrayIndexOutOfBoundsException} are now tolerated: a bare
 * {@code "?"} yields an empty map, a parameter without a value maps to {@code ""}, and empty
 * pairs (as in {@code "a=1&&b=2"}) are skipped.
 *
 * @param url the URL to parse; may be {@code null} or blank
 * @return a new mutable map of parameters, empty when there is no query string
 */
public static Map<String,String> parse(String url) {
    Map<String,String> map=new HashMap<String,String>();
    if (url == null) {
        return map;
    }
    String trimmed = url.trim();
    if (trimmed.isEmpty()) {
        return map;
    }
    String[] urlParts = trimmed.split("\\?");
    // Fewer than two parts means there is no query string (this also covers a bare "?").
    if (urlParts.length < 2) {
        return map;
    }
    for (String param : urlParts[1].split("&")) {
        String[] keyValue = param.split("=");
        if (keyValue.length >= 2) {
            map.put(keyValue[0], keyValue[1]);
        } else if (keyValue.length == 1 && !keyValue[0].isEmpty()) {
            // Parameter present without a value, e.g. "flag" or "flag=".
            map.put(keyValue[0], "");
        }
        // Otherwise the pair is empty ("" or "=") and carries no information.
    }
    return map;
}
/**
 * Returns the {@code __biz} query parameter of {@code url}.
 *
 * <p>If parsing fails, the stack trace and the offending URL are printed and {@code null} is
 * returned.
 *
 * @param url the URL to inspect
 * @return the {@code __biz} value as produced by {@code parse}, or {@code null} when absent
 */
public static String getParam(String url) {
    Map<String, String> query;
    try {
        query = parse(url);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(url);
        // Fall back to an empty map so the lookup below yields null.
        query = new HashMap<String, String>();
    }
    return query.get("__biz");
}
/**
 * Returns the query parameter called {@code name} from {@code url}.
 *
 * @param url  the URL to inspect
 * @param name the parameter name to look up
 * @return the value as produced by {@code parse}, or {@code null} when the parameter is absent
 */
public static String getParambyname(String url,String name) {
    return parse(url).get(name);
}
/**
 * Extracts the text between {@code "__biz="} and the following {@code "#wechat"} marker.
 *
 * <p>Unlike {@code parse}, this keeps any trailing {@code '='} padding of the id. A string that
 * starts with {@code "__biz="} is now handled (index 0 was previously rejected), and a
 * {@code "#wechat"} marker that occurs only before the id yields {@code null} instead of a
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s the URL or fragment to scan; may be {@code null}
 * @return the id, or {@code null} when either marker is missing
 */
public static String getweixinId(String s) {
    if (s == null) {
        return null;
    }
    final String marker = "__biz=";
    int start = s.indexOf(marker);
    if (start < 0) {
        return null;
    }
    int valueStart = start + marker.length();
    // Search for the terminator only after the id so the substring bounds are always valid.
    int end = s.indexOf("#wechat", valueStart);
    if (end < 0) {
        return null;
    }
    return s.substring(valueStart, end);
}
/**
 * Extracts the text between {@code "__biz="} and the following {@code "&mid="} parameter.
 *
 * <p>Unlike {@code parse}, this keeps any trailing {@code '='} padding of the id. A string that
 * starts with {@code "__biz="} is now handled (index 0 was previously rejected), and a
 * {@code "&mid="} that occurs only before the id yields {@code null} instead of a
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s the URL or fragment to scan; may be {@code null}
 * @return the id, or {@code null} when either marker is missing
 */
public static String getweixinId1(String s) {
    if (s == null) {
        return null;
    }
    final String marker = "__biz=";
    int start = s.indexOf(marker);
    if (start < 0) {
        return null;
    }
    int valueStart = start + marker.length();
    // Search for the terminator only after the id so the substring bounds are always valid.
    int end = s.indexOf("&mid=", valueStart);
    if (end < 0) {
        return null;
    }
    return s.substring(valueStart, end);
}
/**
 * Downloads one WeChat article, builds a {@code DocInfo} for it and marks the URL as crawled in
 * Redis.
 *
 * <p>NOTE(review): this method looks unfinished. The iterator that drives the final loop is
 * assigned {@code null} and dereferenced immediately, so every call that gets past the
 * "already crawled" check ends in a {@code NullPointerException}. The value read into
 * {@code organdtids} is never used. These defects are documented rather than fixed because the
 * intended source of the org/tid/sid entries cannot be determined from this file.
 *
 * @param siteMsgTemple the site whose URI is the article URL to crawl
 * @throws Exception propagated from the extraction and Redis helpers
 */
public void crawlerweixin(SiteMsgTemple siteMsgTemple) throws Exception{
String weixinurl=siteMsgTemple.getSiteUri();
// Check whether this URL has already been crawled (any non-empty Redis value counts).
String urlflag=JedisUtil.getString(weixinurl);
if(!StringUtils.isEmpty(urlflag)){
System.out.println("已爬取1"+weixinurl);
return;
}
// Look up the organisation mapping stored under the account's __biz id.
String weixinid=getParam(weixinurl);
// NOTE(review): organdtids is read but never used below.
String organdtids=JedisUtil.getString(weixinid);
WeiXinDispatch wx=new WeiXinDispatch();
ExtEntity extEntity=wx.getExtractorElement(weixinurl);
String contentNoTag = null;
// NOTE(review): the second argument is a fixed article URL rather than weixinurl - confirm intent.
Map<String, FileTag> imgDataMap= ContentFileFinder.getContentFileTag(extEntity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
System.out.println(extEntity.getContentWithTag());
String formatImgContent=extEntity.getContentWithTag();
for (String key : imgDataMap.keySet()) {
// NOTE(review): an empty key would make this loop spin forever, since contains("") is always true.
while (formatImgContent.contains(key)) {
// Original note: "convert to absolute path"; the code actually removes every occurrence of the key.
formatImgContent = formatImgContent.replace(key, "");
}
}
extEntity.setContentWithTag(formatImgContent);
// NOTE(review): contentWithTag is always empty here, so the plain text is derived from "".
String contentWithTag = "";
contentNoTag = ContentUtility.TransferHTML2Text(contentWithTag);
DocInfo docInfo=new DocInfo();
docInfo.setSourceType("WeChat");
// docInfo.setLastModified(lastModified);
docInfo.setSourceaddress(weixinurl);
docInfo.setLang("zh_CN");
docInfo.setContentType("HTML");
// Second setSourceType call; it follows the "WeChat" value set above.
docInfo.setSourceType("News");
docInfo.setCharset("utf-8");
docInfo.setTitle(extEntity.getTitle());
docInfo.setAuthor(extEntity.getAuthor());
docInfo.setPublishDate(extEntity.getPublishDate());
docInfo.setOrigin("微信公众号-"+extEntity.getAuthor());
// docInfo.setKeywords(extEntity.getKeywords());
//docInfo.setSummary(extEntity.getSummary());
// Wrap the cleaned article body in a minimal UTF-8 HTML document.
StringBuffer sb = new StringBuffer();
sb.append("<html><head>");
sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
sb.append("<title></title></head><body>");
sb.append(extEntity.getContentWithTag());
sb.append("</body></html>");
docInfo.setContentWithTag(sb.toString());
docInfo.setContentNoTag(contentNoTag);
docInfo.setContentImgCvtTag(sb.toString());
// NOTE(review): 'it' is null, so it.hasNext() below always throws NullPointerException.
Iterator<String> it =null;
// Iterator<String> it = organdtids.iterator();
while (it.hasNext()) {
String str = it.next();
System.out.println(str);
// Each entry is split on '-' into org id, tid and sid; any other shape aborts the whole method.
String[] ss= str.split("-");
if(ss.length!=3){
return;
}else{
System.out.println(str);
// Prints the array's default toString(), not its elements.
System.out.println(ss.toString());
}
String orgid=ss[0];
String tid=ss[1];
String sid=ss[2];
docInfo.setOrgId(Long.valueOf(orgid));
docInfo.setSid(Long.valueOf(sid));
Map<String, String> params = new HashMap<String, String>();
params.put("fromWhere", "weixincraw");
if (null!=tid&&!"null".equals(tid)) {
params.put("tid", tid);
}
docInfo.setOtherParams(params);
// Cut-off used for the publish-date comparison below (helper called with 2).
String week = DateUtil.getDateBeforeDays(new Date() , 2);
if(docInfo.getTitle()==null){
if(StringUtils.isEmpty(contentNoTag)){
// Empty content: leave the URL unmarked so it is crawled again.
}else{
// Non-empty content without a title: record the URL as done.
JedisUtil.setString(weixinurl, 1+"",0);
}
}else if(docInfo.getPublishDate().compareTo(week)<0){
// Original note: articles older than one day are not pushed for the foreign-affairs-office project.
// NOTE(review): getPublishDate() is dereferenced without a null check here.
System.out.println("时间过期"+docInfo.getPublishDate());
JedisUtil.setString(weixinurl, 1+"",0);
// Original note: other projects are still pushed.
}else{
JedisUtil.setString(weixinurl, 1+"",0);
}
}
}
/**
 * Manual scratch entry point that prints the results of the URL helpers for two sample links
 * and of a string comparison between a fixed timestamp and the date helper's output.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String profileUrl = "https://mp.weixin.qq.com/mp/profile_ext?action=home&scene=114&__biz=MzAwODE2OTAwNg==#wechat_redirect";
    System.out.println(getParam(profileUrl));

    // Same extraction done by hand with indexOf/substring, for comparison with getParam.
    int from = profileUrl.indexOf("__biz=");
    int to = profileUrl.indexOf("#wechat");
    String bizId = (from > 0 && to > 0) ? profileUrl.substring(from + 6, to) : null;
    System.out.println(bizId);

    bizId = getParam(profileUrl);
    System.out.println(bizId);

    String sampleTime = "2019-11-18 12:20:23";
    String cutoff = DateUtil.getDateBeforeDays(new Date(), 1);
    System.out.println(sampleTime.compareTo(cutoff));

    String articleUrl = "http://mp.weixin.qq.com/s?__biz=MzUxMzEzNjg1Ng==&mid=2247484003&idx=1&sn=965ca574850ab65be466c443bf8e2a3b&scene=0965ca574850ab65be466c443bf8e2a3b";
    String signature = getParambyname(articleUrl, "signature");
    System.out.println(signature);
}
}
\ No newline at end of file
package com.zzsn.common;
/**
 * WeChat general-purpose API credential: the token string and its validity period.
 *
 * @author liufeng
 * @date 2013-08-08
 */
public class AccessToken {
    /** The credential returned by the API. */
    private String token;

    /** Validity period of the credential, in seconds. */
    private int expiresIn;

    /** @param token the credential to store */
    public void setToken(String token) {
        this.token = token;
    }

    /** @return the stored credential, or {@code null} when none has been set */
    public String getToken() {
        return this.token;
    }

    /** @param expiresIn validity period in seconds */
    public void setExpiresIn(int expiresIn) {
        this.expiresIn = expiresIn;
    }

    /** @return validity period in seconds, {@code 0} when never set */
    public int getExpiresIn() {
        return this.expiresIn;
    }
}
package com.zzsn.common;
import javax.net.ssl.X509TrustManager;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
 * Certificate trust manager used for HTTPS requests.
 *
 * <p>WARNING: both check methods are empty, so every client and server certificate chain is
 * accepted without validation. This disables TLS authentication and should only be used where
 * that risk is understood.
 *
 * @author liufeng
 * @date 2013-08-08
 */
public class MyX509TrustManager implements X509TrustManager {
    /** Accepts any client certificate chain without validation. */
    @Override
    public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
    }

    /** Accepts any server certificate chain without validation. */
    @Override
    public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
    }

    /**
     * Returns an empty issuer list.
     *
     * <p>The {@link X509TrustManager} contract requires a non-null (possibly empty) array;
     * returning {@code null}, as this method previously did, can cause a
     * {@code NullPointerException} in code that consumes the result.
     *
     * @return an empty, non-null array
     */
    @Override
    public X509Certificate[] getAcceptedIssuers() {
        return new X509Certificate[0];
    }
}
\ No newline at end of file
package com.zzsn.common;
/**
 * Simple holder for the text body of a message.
 */
public class TextContent {
    /** The message text; {@code null} until set. */
    private String content;

    /** @param content the message text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the stored message text, or {@code null} when none has been set */
    public String getContent() {
        return this.content;
    }
}
package com.zzsn.common.cache;
import java.util.HashMap;
import java.util.Map;
/**
 * Shared constants and a process-wide registration map.
 *
 * <p>All members are implicitly {@code public static final} because this is an interface.
 * NOTE(review): the constant named {@code _} is a reserved identifier from Java 9 onwards; it is
 * kept here only so existing references continue to resolve.
 *
 * @ClassName: IGatRedisKey
 * @author: renkai721
 * @date: 2018-06-25 16:54:42
 */
public interface IGatRedisKey {
    /** Mutable registry shared by every user of this interface; it is a plain {@code HashMap}. */
    Map<String, String> register_map = new HashMap<String, String>();

    /** Negative status marker. */
    String NO = "no";

    /** Positive status marker. */
    String OK = "ok";

    /** Plain-HTTP scheme prefix. */
    String HTTP = "http://";

    /** Semicolon separator. */
    String F = ";";

    /** Underscore separator. */
    String _ = "_";
}
package com.zzsn.common.cache;
import com.google.code.yanf4j.core.impl.StandardSocketOption;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.command.BinaryCommandFactory;
import net.rubyeye.xmemcached.transcoders.SerializingTranscoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Memcached client holder.
 *
 * <p>The single shared {@link MemcachedClient} is built once, when this class is loaded, from the
 * {@code memcached.properties} resource. A missing or malformed {@code memcached.server} entry
 * aborts class initialisation.
 */
public class Memcached {
    private static Logger logger = LoggerFactory.getLogger(Memcached.class);
    private static MemcachedClient client = null;

    static {
        logger.debug("memcached initialize...");
        Properties prop = getConfig();
        String server = prop.getProperty("memcached.server");
        if (!hasValue(server)) {
            throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
        }

        // "host:port" entries separated by commas; weights come from memcached.server<N>.weight.
        String[] servers = server.split(",");
        int[] weights = new int[servers.length];
        List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
        for (int i = 0; i < servers.length; i++) {
            String[] addr = servers[i].trim().split(":");
            if (addr.length < 2) {
                // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
                throw new IllegalArgumentException("Invalid 'memcached.server' entry, expected host:port but got: " + servers[i]);
            }
            addressList.add(new InetSocketAddress(addr[0].trim(), Integer.parseInt(addr[1].trim())));
            String weight = prop.getProperty("memcached.server"+(i+1)+".weight");
            weights[i] = hasValue(weight) ? Integer.parseInt(weight) : 1;
        }

        XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList, weights);
        String poolSize = prop.getProperty("memcached.connectionPoolSize");
        if (hasValue(poolSize)) {
            builder.setConnectionPoolSize(Integer.parseInt(poolSize));
        }
        String failureMode = prop.getProperty("memcached.failureMode");
        if (hasValue(failureMode)) {
            builder.setFailureMode(Boolean.parseBoolean(failureMode));
        }
        String connTimeout = prop.getProperty("memcached.connectTimeout");
        if (hasValue(connTimeout)) {
            builder.setConnectTimeout(Integer.parseInt(connTimeout));
        }
        String opTimeout = prop.getProperty("memcached.opTimeout");
        if (hasValue(opTimeout)) {
            builder.setOpTimeout(Integer.parseInt(opTimeout));
        }
        String enableHealSession = prop.getProperty("memcached.enableHealSession");
        if (hasValue(enableHealSession)) {
            // Enables or disables automatic connection repair.
            builder.setEnableHealSession(Boolean.parseBoolean(enableHealSession));
        }
        String statistics = prop.getProperty("memcached.statistics");
        if (hasValue(statistics)) {
            builder.getConfiguration().setStatisticsServer(Boolean.parseBoolean(statistics));
        }
        if ("true".equals(prop.getProperty("memcached.binaryCommand"))) {
            builder.setCommandFactory(new BinaryCommandFactory());
        }
        builder.setTranscoder(new SerializingTranscoder());
        // Receive buffer 32K and send buffer 16K.
        builder.setSocketOption(StandardSocketOption.SO_RCVBUF, 32* 1024);
        builder.setSocketOption(StandardSocketOption.SO_SNDBUF,16 *1024);
        // TCP_NODELAY=true turns Nagle's algorithm OFF so small packets are sent immediately.
        builder.setSocketOption(StandardSocketOption.TCP_NODELAY,true);

        // Prefer the correctly spelled key; fall back to the historical misspelling so that
        // existing property files keep working.
        String sessionIdleTimeout = prop.getProperty("memcached.sessionIdleTimeout");
        if (!hasValue(sessionIdleTimeout)) {
            sessionIdleTimeout = prop.getProperty("memcahced.sessionIdleTimeout");
        }
        if (hasValue(sessionIdleTimeout)) {
            // Configured in seconds, converted to milliseconds for the session idle timeout.
            builder.getConfiguration().setSessionIdleTimeout(Integer.parseInt(sessionIdleTimeout)*1000);
        }

        try {
            client = builder.build();
            String optimizeMergeBuffer = prop.getProperty("memcached.optimizeMergeBuffer");
            if (hasValue(optimizeMergeBuffer)) {
                client.setOptimizeMergeBuffer(Boolean.parseBoolean(optimizeMergeBuffer));
            }
            // The guard used to test optimizeMergeBuffer here, which threw a NullPointerException
            // when only mergeFactor was configured and parsed "" when mergeFactor was empty.
            String mergeFactor = prop.getProperty("memcached.mergeFactor");
            if (hasValue(mergeFactor)) {
                client.setMergeFactor(Integer.parseInt(mergeFactor));
            }
            logger.debug("memcached initialize completed!");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private Memcached() {
    }

    /** Returns true when the property value is neither null nor empty. */
    private static boolean hasValue(String value) {
        return value != null && !value.isEmpty();
    }

    /**
     * Returns the shared client built during class initialisation.
     *
     * @return the shared client
     */
    public static MemcachedClient getClient() {
        return client;
    }

    /**
     * Shuts the given client down, logging (not propagating) any I/O failure.
     *
     * @param client the client to shut down; {@code null} is ignored
     */
    public static void shutdown(MemcachedClient client) {
        if (client == null) {
            return;
        }
        try {
            client.shutdown();
        } catch (IOException e) {
            logger.error("Failed to shut down memcached client", e);
        }
    }

    /**
     * Loads {@code memcached.properties} through Spring's default resource loader.
     *
     * @return the loaded properties; empty when the resource cannot be read (the error is logged)
     */
    public static Properties getConfig() {
        Properties properties = new Properties();
        String location = "memcached.properties";
        Resource resource = new DefaultResourceLoader().getResource(location);
        try (InputStream is = resource.getInputStream()) {
            properties.load(is);
            logger.debug("memcached config: {}", properties.toString());
        } catch (IOException ex) {
            logger.error("Could not load property file:" + location, ex);
        }
        return properties;
    }
}
package com.zzsn.common.cache;
import net.rubyeye.xmemcached.KeyIterator;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Administrative helpers for Memcached: key enumeration, prefix deletion and adding servers.
 *
 * <p>Key enumeration uses its own lazily created client, separate from the shared one held by
 * {@link Memcached}.
 *
 * @Author Sugar
 * @Version 2018/6/8 13:22
 */
public class MemcachedAdmin {
    private static Logger logger = LoggerFactory.getLogger(MemcachedAdmin.class);
    private static MemcachedClient client = null;

    private MemcachedAdmin() {
    }

    /**
     * Returns the admin client, creating it on first use from {@code memcached.server}.
     *
     * <p>Synchronised so that concurrent first calls cannot each build (and leak) a client.
     *
     * @throws IllegalArgumentException when {@code memcached.server} is not configured
     * @throws IllegalStateException    when the client cannot be built; previously this returned
     *                                  {@code null} and callers failed later with a
     *                                  {@code NullPointerException}
     */
    private static synchronized MemcachedClient getClient() {
        if (client != null) {
            return client;
        }
        Properties prop = Memcached.getConfig();
        String server = prop.getProperty("memcached.server");
        if (server == null || server.isEmpty()) {
            throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
        }
        String[] servers = server.split(",");
        List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
        for (int i = 0; i < servers.length; i++) {
            String[] addr = servers[i].split(":");
            addressList.add(new InetSocketAddress(addr[0], Integer.parseInt(addr[1])));
        }
        XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList);
        try {
            client = builder.build();
        } catch (IOException e) {
            throw new IllegalStateException("Unable to build memcached admin client for " + server, e);
        }
        return client;
    }

    /**
     * Deletes every key that starts with {@code keyPrefix} on all available servers.
     *
     * <p>Failures on one server are logged and do not stop processing of the remaining servers.
     *
     * @param keyPrefix prefix of the keys to delete
     * @return the number of matching keys for which a delete was issued
     */
    @Deprecated
    public static long deleteAll(String keyPrefix) {
        AtomicLong count = new AtomicLong();
        MemcachedClient admin = getClient();
        admin.getAvailableServers().forEach(inet -> {
            try {
                KeyIterator iterator = admin.getKeyIterator(inet);
                while (iterator.hasNext()) {
                    String key = iterator.next();
                    if (key.startsWith(keyPrefix)) {
                        boolean result = admin.delete(key);
                        long i = count.incrementAndGet();
                        if (logger.isDebugEnabled()) {
                            logger.debug("[{}] Delete key[{}]: {}={}", inet, i, key, result);
                        }
                    }
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while deleting keys on " + inet, e);
            } catch (MemcachedException | TimeoutException e) {
                logger.error("Failed to delete keys on " + inet, e);
            }
        });
        logger.info("Delete a total of {} keys starting with {}", count.get(), keyPrefix);
        return count.get();
    }

    /**
     * Collects every key that starts with {@code keyPrefix} from all available servers.
     *
     * <p>Failures on one server are logged and do not stop processing of the remaining servers.
     *
     * @param keyPrefix prefix of the keys to collect
     * @return the matching keys, possibly empty
     */
    @Deprecated
    public static List<String> getAllKey(String keyPrefix) {
        MemcachedClient admin = getClient();
        List<String> keys = new ArrayList<>();
        admin.getAvailableServers().forEach(inet -> {
            try {
                KeyIterator iterator = admin.getKeyIterator(inet);
                while (iterator.hasNext()) {
                    String key = iterator.next();
                    if (key.startsWith(keyPrefix)) {
                        keys.add(key);
                    }
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while listing keys on " + inet, e);
            } catch (MemcachedException | TimeoutException e) {
                logger.error("Failed to list keys on " + inet, e);
            }
        });
        return keys;
    }

    /**
     * Adds a server with weight 1 to the shared client at runtime.
     *
     * @param host server host name or address
     * @param port server port
     * @return {@code true} on success, {@code false} when an I/O error occurred
     */
    public static boolean addServer(String host, int port) {
        return addServer(host, port, 1);
    }

    /**
     * Adds a weighted server to the shared client held by {@link Memcached} at runtime.
     *
     * @param host   server host name or address
     * @param port   server port
     * @param weight server weight
     * @return {@code true} on success, {@code false} when an I/O error occurred
     */
    public static boolean addServer(String host, int port, int weight) {
        try {
            Memcached.getClient().addServer(host, port, weight);
            return true;
        } catch (IOException e) {
            logger.error("Failed to add memcached server " + host + ":" + port, e);
            return false;
        }
    }
}
package com.zzsn.common.cache;
import io.protostuff.*;
import io.protostuff.runtime.RuntimeSchema;
import org.springframework.objenesis.Objenesis;
import org.springframework.objenesis.ObjenesisStd;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Protostuff-based (de)serialisation helpers for single objects and lists.
 *
 * <p>Instances are created through Objenesis, so target classes do not need a no-argument
 * constructor.
 *
 * @Author Sugar
 * @Version 2018/3/16 13:32
 */
public class ProtostuffUtil {
    public static final Objenesis objenesis = new ObjenesisStd(true);
    private static final String ERR_TRUNCATED_MESSAGE =
            "While parsing a protocol message, the input ended unexpectedly " +
            "in the middle of a field. This could mean either than the " +
            "input has been truncated or that an embedded message " +
            "misreported its own length.";

    /**
     * Serialises a single object using the runtime schema of its class.
     *
     * @param obj the object to serialise
     * @return the encoded bytes
     * @throws IllegalStateException when serialisation fails (including a {@code null} object)
     */
    @SuppressWarnings("unchecked")
    public static <T> byte[] serializer(T obj) {
        LinkedBuffer buffer = LinkedBuffer.allocate(LinkedBuffer.DEFAULT_BUFFER_SIZE);
        try {
            Schema<T> schema = RuntimeSchema.getSchema((Class<T>) obj.getClass());
            return ProtostuffIOUtil.toByteArray(obj, schema, buffer);
        } catch (Exception e) {
            throw new IllegalStateException("序列化对象失败:" + obj, e);
        } finally {
            buffer.clear();
        }
    }

    /**
     * Deserialises a single object.
     *
     * @param data  the encoded bytes
     * @param clazz the target class
     * @return a new instance populated from {@code data}
     * @throws IllegalStateException when deserialisation fails
     */
    public static <T> T deserializer(byte[] data, Class<T> clazz) {
        T obj = null;
        try {
            Schema<T> schema = RuntimeSchema.getSchema(clazz);
            // obj = schema.newMessage();
            obj = objenesis.newInstance(clazz);
            ProtostuffIOUtil.mergeFrom(data, obj, schema);
        } catch (Exception e) {
            throw new IllegalStateException("反序列化对象失败:class=" + clazz + ", data=" + new String(data), e);
        }
        return obj;
    }

    /**
     * Serialises a list of objects; the schema is taken from the class of the first element.
     *
     * <p>An empty list now yields an empty byte array, which {@link #deserializeList} reads back
     * as an empty list. Previously an empty list failed with an
     * {@code IndexOutOfBoundsException} while looking up the first element.
     *
     * @param list the objects to serialise; must not be {@code null}
     * @return the encoded bytes, empty for an empty list
     * @throws IllegalStateException when serialisation fails
     */
    public static <T> byte[] serializeList(List<T> list) {
        if (list.isEmpty()) {
            return new byte[0];
        }
        @SuppressWarnings("unchecked")
        Schema<T> schema = (Schema<T>) RuntimeSchema.getSchema(list.get(0).getClass());
        LinkedBuffer buffer = LinkedBuffer.allocate(1024 * 1024);
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            ProtostuffIOUtil.writeListTo(bos, list, schema, buffer);
            return bos.toByteArray();
        } catch (Exception e) {
            throw new IllegalStateException("序列化对象列表失败:" + list, e);
        } finally {
            buffer.clear();
        }
    }

    /**
     * Deserialises a list of objects.
     *
     * @param data  the encoded bytes
     * @param clazz the element class
     * @param <T>   the element type
     * @return the decoded elements; an empty list when {@code data} is empty
     * @throws IllegalStateException when the input cannot be read
     */
    public static <T> List<T> deserializeList(byte[] data, Class<T> clazz) {
        Schema<T> schema = RuntimeSchema.getSchema(clazz);
        List<T> result = null;
        try {
            result = parseListFrom(new ByteArrayInputStream(data), schema, clazz);
        } catch (IOException e) {
            throw new IllegalStateException("反序列化对象列表失败:class=" + clazz + ", data=" + new String(data), e);
        }
        return result;
    }

    /**
     * Reads a size-prefixed sequence of messages from {@code in}.
     *
     * <p>The first byte (or varint, when the byte is above 0x7f) is the element count; each
     * element is then merged into an instance created through Objenesis.
     */
    private static <T> List<T> parseListFrom(final InputStream in, final Schema<T> schema, Class<T> clazz)
            throws IOException {
        int size = in.read();
        if (size == -1) {
            return Collections.emptyList();
        }
        if (size > 0x7f) {
            size = readRawVarint32(in, size);
        }
        final ArrayList<T> list = new ArrayList<T>(size);
        final CodedInput input = new CodedInput(in, true);
        for (int i = 0; i < size; i++) {
            // final T message = schema.newMessage();
            // Objenesis is used instead of newInstance() so no default constructor is required.
            final T message = objenesis.newInstance(clazz);
            list.add(message);
            schema.mergeFrom(input, message);
            input.checkLastTagWas(0);
        }
        // Only evaluated when assertions are enabled.
        assert in.read() == -1;
        return list;
    }

    /**
     * Reads a varint from the input one byte at a time, so that it does not read any bytes after the end of the varint.
     * If you simply wrapped the stream in a CodedInput and used readRawVarint32(InputStream) then you would
     * probably end up reading past the end of the varint since CodedInput buffers its input.
     */
    private static int readRawVarint32(final InputStream input, final int firstByte) throws IOException {
        int result = firstByte & 0x7f;
        int offset = 7;
        for (; offset < 32; offset += 7) {
            final int b = input.read();
            if (b == -1) {
                throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
            }
            result |= (b & 0x7f) << offset;
            if ((b & 0x80) == 0) {
                return result;
            }
        }
        // Keep reading up to 64 bits; the extra bytes are consumed but do not change the result.
        for (; offset < 64; offset += 7) {
            final int b = input.read();
            if (b == -1) {
                throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
            }
            if ((b & 0x80) == 0) {
                return result;
            }
        }
        throw new ProtobufException(
                "CodedInput encountered a malformed varint.");
    }
}
//package com.zzsn.configuration;
//
//import com.baomidou.mybatisplus.autoconfigure.ConfigurationCustomizer;
//import com.baomidou.mybatisplus.extension.plugins.PaginationInterceptor;
//import org.springframework.context.annotation.Bean;
//import org.springframework.context.annotation.Configuration;
//
//
//@Configuration
//public class MybatisPlusConfig {
//
// /**
// * 新的分页插件,一缓和二缓遵循mybatis的规则,需要设置 MybatisConfiguration#useDeprecatedExecutor = false 避免缓存出现问题
// */
//// @Bean
//// public MybatisPlusInterceptor mybatisPlusInterceptor() {
//// MybatisPlusInterceptor interceptor = new MybatisPlusInterceptor();
//// //注释下面的可能出现获取不到总数的效果
//// interceptor.addInnerInterceptor(new PaginationInnerInterceptor(DbType.MYSQL));
//// return interceptor;
//// }
// @Bean
// public PaginationInterceptor paginationInterceptor() {
// return new PaginationInterceptor();
// }
//
// @Bean
// public ConfigurationCustomizer configurationCustomizer() {
// return configuration -> configuration.setUseDeprecatedExecutor(false);
// }
//
//}
\ No newline at end of file
package com.zzsn.configuration;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.concurrent.TimeUnit;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import okhttp3.ConnectionPool;
import okhttp3.OkHttpClient;
/**
 * Spring configuration for the shared {@link OkHttpClient} and its supporting beans.
 */
@Configuration
public class OkHttpConfiguration {
    /**
     * Builds the shared HTTP client: no automatic retry on connection failure, the pooled
     * connections from {@link #pool()}, and 30-second connect/read/write timeouts.
     *
     * <p>The trust-all SSL socket factory is intentionally not installed (the call is commented
     * out), so default certificate validation applies.
     */
    @Bean
    public OkHttpClient okHttpClient() {
        return new OkHttpClient.Builder()
                // .sslSocketFactory(sslSocketFactory(), x509TrustManager())
                .retryOnConnectionFailure(false).connectionPool(pool()).connectTimeout(30, TimeUnit.SECONDS)
                .readTimeout(30, TimeUnit.SECONDS).writeTimeout(30, TimeUnit.SECONDS).build();
    }

    /**
     * Trust manager that accepts every certificate chain.
     *
     * <p>WARNING: using this bean disables TLS certificate validation.
     */
    @Bean
    public X509TrustManager x509TrustManager() {
        return new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
            }
            @Override
            public void checkServerTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
            }
            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }
        };
    }

    /**
     * Socket factory backed by a TLS context that trusts any peer.
     *
     * @return the socket factory, never {@code null}
     * @throws IllegalStateException when the TLS context cannot be created; previously the error
     *                               was printed and a {@code null} bean was returned
     */
    @Bean
    public SSLSocketFactory sslSocketFactory() {
        try {
            // Trust any connection.
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, new TrustManager[] { x509TrustManager() }, new SecureRandom());
            return sslContext.getSocketFactory();
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            throw new IllegalStateException("Unable to initialise the trust-all TLS context", e);
        }
    }

    /**
     * Connection pool shared by the HTTP client, constructed with the values 200 and
     * 5 minutes (maximum idle connections and keep-alive duration in OkHttp's constructor).
     */
    @Bean
    public ConnectionPool pool() {
        return new ConnectionPool(200, 5, TimeUnit.MINUTES);
    }
}
//package com.zzsn.configuration;
//
//import org.springframework.context.annotation.Configuration;
//import org.springframework.scheduling.annotation.SchedulingConfigurer;
//import org.springframework.scheduling.config.ScheduledTaskRegistrar;
//
//import java.util.concurrent.Executors;
//
//@Configuration
//public class ScheduleConfig implements SchedulingConfigurer {
// @Override
// public void configureTasks(ScheduledTaskRegistrar taskRegistrar) {
// //当然了,这里设置的线程池是corePoolSize也是很关键了,自己根据业务需求设定
// taskRegistrar.setScheduler(Executors.newScheduledThreadPool(1));
//
//
// /**为什么这么说呢?
// 假设你有4个任务需要每隔1秒执行,而其中三个都是比较耗时的操作可能需要10多秒,而你上面的语句是这样写的:
// taskRegistrar.setScheduler(Executors.newScheduledThreadPool(3));
// 那么仍然可能导致最后一个任务被阻塞不能定时执行
// **/
// }
//}
package com.zzsn.configuration;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
import java.util.Map;
/**
 * Static access to the Spring {@link ApplicationContext}, so beans can be looked up from code
 * that is not itself managed by Spring.
 *
 * <p>The context is captured when Spring calls {@link #setApplicationContext}; every accessor
 * below requires that to have happened first.
 *
 * @author https://blog.csdn.net/chen_2890
 * @date 2019/6/26 16:20
 */
@Component
public class SpringContextUtil implements ApplicationContextAware {
    private static ApplicationContext applicationContext;

    /** Stores the context handed over by Spring in the static holder. */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        SpringContextUtil.applicationContext = applicationContext;
    }

    /**
     * Returns the captured application context.
     *
     * @return the context, or {@code null} before Spring has initialised this component
     */
    public static ApplicationContext getApplicationContext() {
        return applicationContext;
    }

    /**
     * Looks a bean up by name.
     *
     * @param name the bean name
     * @return the bean, or {@code null} when no bean with that name is defined
     */
    public static Object getBean(String name) {
        Object o = null;
        try {
            o = getApplicationContext().getBean(name);
        } catch (NoSuchBeanDefinitionException e) {
            // Deliberately mapped to a null result.
        }
        return o;
    }

    /**
     * Looks a bean up by type.
     *
     * @param clazz the required bean type
     * @return the matching bean
     */
    public static <T> T getBean(Class<T> clazz) {
        return getApplicationContext().getBean(clazz);
    }

    /**
     * Looks a bean up by name and type.
     *
     * @param name  the bean name
     * @param clazz the required bean type
     * @return the matching bean
     */
    public static <T> T getBean(String name, Class<T> clazz) {
        return getApplicationContext().getBean(name, clazz);
    }

    /**
     * Returns all beans of the given type, keyed by bean name.
     *
     * @param clazz the bean type to collect
     * @return the matching beans by name
     */
    public static <T> Map<String, T> getBeansOfType(Class<T> clazz) {
        return getApplicationContext().getBeansOfType(clazz);
    }

    /**
     * Returns the value of a configuration property.
     *
     * @param key the property key
     * @return the property value as returned by the environment
     */
    public static String getEnvironmentProperty(String key) {
        return getApplicationContext().getEnvironment().getProperty(key);
    }

    /**
     * Returns the first active profile ({@code spring.profiles.active}).
     *
     * @return the first active profile, or {@code null} when no profile is active; previously
     *         this case failed with an {@code ArrayIndexOutOfBoundsException}
     */
    public static String getActiveProfile() {
        String[] profiles = getApplicationContext().getEnvironment().getActiveProfiles();
        return profiles.length > 0 ? profiles[0] : null;
    }
}
package com.zzsn.configuration;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * Spring configuration that switches on {@code @EnableAsync} and declares the
 * {@code asyncTaskExecutor} thread pool bean.
 */
@Configuration
@EnableAsync
public class ThreadExecutorConfig {

    /**
     * Builds a single-threaded pool (core size 1, max size 1) with a 5000-entry queue.
     *
     * @return the initialised executor
     */
    @Bean(value = "asyncTaskExecutor")
    public Executor executor() {
        ThreadPoolTaskExecutor pool = new ThreadPoolTaskExecutor();
        pool.setThreadNamePrefix("ssmsExecutor-");
        // Minimum and maximum number of threads kept by the pool.
        pool.setCorePoolSize(1);
        pool.setMaxPoolSize(1);
        // Capacity of the buffering queue.
        pool.setQueueCapacity(5000);
        // Allowed idle time, in seconds.
        pool.setKeepAliveSeconds(60);
        // Rejection policy CALLER_RUNS: a rejected task is not run on a pool thread but on the
        // thread that submitted it.
        pool.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        pool.initialize();
        return pool;
    }
}
\ No newline at end of file
package com.zzsn.crawler;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.awx.service.SiteService;
import org.springframework.beans.BeansException;
import java.util.TimerTask;
/**
 * Timer task that prints a start message and then runs {@link #task()}.
 */
public class SiteTask extends TimerTask {

    @Override
    public void run() {
        System.out.println("开是发送信息");
        task();
    }

    /**
     * Looks up the {@link SiteService} bean. The actual send call is disabled (commented out),
     * so at present only the lookup itself is performed; a failed lookup is printed to stderr.
     */
    public static void task() {
        try {
            SpringContextUtil.getBean(SiteService.class);
            // sites.sendUrlToweixin("");
        } catch (BeansException lookupFailure) {
            lookupFailure.printStackTrace();
        }
    }
}
package com.zzsn.crawler;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.kafka.core.KafkaTemplate;
import java.util.HashMap;
import java.util.Map;
@Slf4j
public class WeixinDetailThread extends Thread{
public SiteMsgTemple siteMsgTemple =new SiteMsgTemple();
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Override
public void run() {
detailCrawler();
}
public boolean detailCrawler(){
boolean flag=false;
String weixinurl = siteMsgTemple.getSiteUri();
//判断是否已爬取
try {
String urlflag = JedisUtil.getString(weixinurl);
if (!StringUtils.isEmpty(urlflag)) {
log.info("已爬取" + weixinurl);
return flag;
}
}catch (Exception e){
log.info("redis获取信息失败");
}
String weixinid=getParam(weixinurl);
log.info("爬取的微信id= "+weixinid);
WeiXinDispatch wx=new WeiXinDispatch();
ExtEntity extEntity=wx.getExtractorElement(weixinurl);
String contentNoTag = null;
Map<String, FileTag> imgDataMap= ContentFileFinder.getContentFileTag(extEntity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
// System.out.println(extEntity.getContentWithTag());
String formatImgContent=extEntity.getContentWithTag();
for (String key : imgDataMap.keySet()) {
while (formatImgContent.contains(key)) {
//转换为绝对路径
String key2="";
if(key.contains("original")){
key2 = "original";
}else if(key.contains("data-src")){
key2 = "data-src";
}else if(key.contains("_src")){
key2 = "_src";
}else if(key.contains("src")){
key2 ="src";
}
String key3=key.replace(key2,"src");
// formatImgContent = formatImgContent.replace(key, key3);
formatImgContent = formatImgContent.replace(key, "");
}
}
extEntity.setContentWithTag(formatImgContent);
DocInfo docInfo=new DocInfo();
docInfo.setSid(Long.parseLong(siteMsgTemple.getId()));
docInfo.setSourceType("WeChat");
docInfo.setSourceaddress(weixinurl);
docInfo.setLang("zh_CN");
docInfo.setContentType("HTML");
docInfo.setSourceType("News");
docInfo.setCharset("utf-8");
docInfo.setTitle(extEntity.getTitle());
docInfo.setAuthor(extEntity.getAuthor());
docInfo.setPublishDate(extEntity.getPublishDate());
docInfo.setOrigin("微信公众号-"+extEntity.getAuthor());
StringBuffer sb = new StringBuffer();
sb.append("<html><head>");
sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
sb.append("<title></title></head><body>");
sb.append(extEntity.getContentWithTag());
sb.append("</body></html>");
contentNoTag=ContentUtility.TransferHTML2Text(sb.toString());
docInfo.setContentWithTag(sb.toString());
docInfo.setContentNoTag(contentNoTag);
docInfo.setContentImgCvtTag(sb.toString());
ObjectMapper mapper = new ObjectMapper();
try {
ClbAnsProcessitem processitem =docInfoTrans2Processitem(docInfo);
if(StringUtils.isEmpty(processitem.getTitle())|| StringUtils.isEmpty(processitem.getContent())){
System.out.println("资讯的信息不全没有发送");
}
String docjson = mapper.writeValueAsString(processitem);
kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
log.info("发送到kafka成功。");
flag=true;
//标记已爬取
JedisUtil.setString(weixinurl,"1",-1);
} catch (Exception e) {
// e.printStackTrace();
log.info("发送到kafka失败。");
}
return flag;
}
public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem();
clbAnsProcessitem.setSid(docInfo.getSid()+"");
clbAnsProcessitem.setTitle(docInfo.getTitle());
clbAnsProcessitem.setContent(docInfo.getContentNoTag());
clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
clbAnsProcessitem.setSummary(docInfo.getSummary());
clbAnsProcessitem.setAuthor(docInfo.getAuthor());
clbAnsProcessitem.setOrigin(docInfo.getOrigin());
clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());
return clbAnsProcessitem;
}
public static Map<String,String> parse(String url) {
Map<String,String> map=new HashMap<String,String>();
if (url == null) {
return map;
}
url = url.trim();
if (url.equals("")) {
return map;
}
String[] urlParts = url.split("\\?");
String uri = urlParts[0];
//没有参数
if (urlParts.length == 1) {
return map;
}
//有参数
String[] params = urlParts[1].split("&");
for (String param : params) {
String[] keyValue = param.split("=");
map.put(keyValue[0], keyValue[1]);
}
return map;
}
public static String getParam(String url) {
Map<String, String> map=new HashMap<String, String>();
try {
map = parse(url);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
System.out.println(url);
}
return map.get("__biz");
}
}
package com.zzsn.crawler;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.awx.service.SiteService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.kafka.core.KafkaTemplate;
/**
 * Worker thread that hands one official-account link to {@link SiteService#sendUrlToweixin}
 * and then pauses for two minutes before finishing.
 */
@Slf4j
public class WeixinSiteThread extends Thread {

    /** Pause after each send, in milliseconds (two minutes). */
    private static final long PAUSE_AFTER_SEND_MILLIS = 1000L * 60 * 2;

    public SiteMsgTemple siteMsgTemple = new SiteMsgTemple();
    public KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);

    @Override
    public void run() {
        crawler();
    }

    /**
     * Sends the official-account link held in {@link #siteMsgTemple} to the phone WeChat client
     * via {@link SiteService}, then sleeps for the configured pause.
     */
    public synchronized void crawler() {
        SiteService sites = SpringContextUtil.getBean(SiteService.class);
        sites.sendUrlToweixin(siteMsgTemple);
        try {
            Thread.sleep(PAUSE_AFTER_SEND_MILLIS);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so whoever owns this thread can see the request.
            Thread.currentThread().interrupt();
            log.warn("WeixinSiteThread interrupted while pausing after send");
        }
    }
}
package com.zzsn.entity;
import lombok.Data;
import java.util.List;
/**
 * Flat data holder for one processed news item. In this file it is populated from a
 * DocInfo by WeixinDetailThread.docInfoTrans2Processitem and serialised to JSON for Kafka.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class ClbAnsProcessitem {
/** Primary key. */
private String id;
/** Information source id. */
private String sid;
/** Team id. */
private String tid;
/** Title. */
private String title;
/** Summary. */
private String summary;
/** Keywords. */
private String keyWords;
/** Body text. */
private String content;
/** Body with markup (set from DocInfo.getContentWithTag in this file). */
private String contentWithtag;
/** Unknown (undocumented in the original). */
private String hash;
/** Author. */
private String author;
/** Source. */
private String sourceSite;
/** Address. */
private String sourceAddress;
/** Unknown (undocumented in the original). */
private String currentProcess;
/** Category. */
private String type;
/** Unknown (undocumented in the original). */
private String withTagFile;
/** Publish time. */
private String publishDate;
/** Creator. */
private String createBy;
/** Creation time. */
private String createDate;
/** Character encoding. */
private String charset;
/** Unknown (undocumented in the original). */
private Integer processResult;
/** Time of the latest update. */
private String lastModified;
/** Organisation id. */
private String orgId;
/** Words. */
private String words;
/** Source. */
private String origin;
/** Unknown (undocumented in the original). */
private String orientation;
/** Source. */
private String fromWhere;
/** Source id. */
private String fromId;
/** Source category. */
private String sourceType;
/** Unknown (undocumented in the original). */
private String featureWords;
/** Download address. */
private String fileDownloadPath;
private String contentImgCvtTag;
/** Related places. */
private String relatePlaces;
/** Related persons. */
private String relatePerson;
/** Related organisations. */
private String relateOrg;
/** Related events. */
private String relateEvent;
/** Related dates. */
private String relateDate;
/** Unknown (undocumented in the original). */
private Integer relevance1;
/** Unknown (undocumented in the original). */
private String relevance;
/** Language. */
private String lang;
/** Organisations. */
private String orgs;
/** (Temporary handling) ids of the related subjects. */
private List<String> subjectIds;
}
\ No newline at end of file
package com.zzsn.entity;
import lombok.Data;
import java.io.Serializable;
import java.util.Map;
/**
 * Data interface document: the intermediate representation of one crawled document.
 * Author: Li Dongliang
 * Created: 2016-4-6 15:44:17
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 *
 * Getters and setters are generated by Lombok's {@code @Data}; the hand-written accessors
 * that used to be kept here as commented-out code duplicated them and have been removed.
 * @version 1.0
 *
 */
@Data
public class DocInfo implements Serializable{
private static final long serialVersionUID = 1L;
// NOTE(review): the only public field in this class; all others are private.
public String id;
private String contentType;
private Long orgId;
private Long sid;
// One of - News: news, BBS: forum, Blog: blog, MicroBlog: microblog, WeChat: WeChat, Video: video, Other: other
private String sourceType;
private String lastModified;
private String charset;
private String sourceaddress;
private String lang;
private String title;
private String author;
private String publishDate;
private String origin;
private String keywords;
private String summary;
private String contentWithTag;
private String contentNoTag;
private String contentImgCvtTag;
private String fileDownLoadPath;
private Map<String,String> otherParams;
}
package com.zzsn.entity;
import lombok.Data;
import java.util.Date;
/**
 * Record of one collection/dispatch run for an information source.
 * Fields are package-private; accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class SiteMsgRecord {
/** Information source id. */
String infoSourceId;
/** Information source code. */
String code;
/** Number of items collected in this run. */
Integer num;
/** Time at which this collection run started. */
Date collectTime;
/** Dispatch time. */
Date dispatcherTime;
/** Dispatch status (0: failure, 1: success). */
String dispatcherStatus;
/** Origin of the record (1: collection, 2: dispatch). */
String source;
}
package com.zzsn.entity;
import lombok.Data;
import java.io.Serializable;
import java.util.regex.Pattern;
/**
 * Crawl template / configuration for one information source. Within this file the crawler
 * threads read {@code siteUri} (the address to crawl) and {@code id} (the source id) from it.
 * Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class SiteMsgTemple implements Serializable {
private static final long serialVersionUID = 1L;
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Information source name. */
private String webSiteName;
/** Section (column) name. */
private String siteName;
/** Section (column) address. */
private String siteUri;
/** Language. */
private String language;
/** Overseas / public / behind-the-firewall markers. */
private String checkedList;
/** URL of the historical data. */
private String hisUriExp;
/** Start time of the historical data. */
private java.util.Date hisDateStartTime;
/** End time of the historical data. */
private java.util.Date hisDateEndTime;
/** Whether all historical data is wanted. */
private String ynHisDataAll;
/** Site level. */
private String siteLevel;
/** Status. */
private Integer status;
/** List page URL. */
private String listUrl;
/** Expression type (list page). */
private String listExpressionType;
/** Position of the list information block. */
private String infoBlockPosition;
/** Locator for the links to extract. */
private String linkLocation;
/** Matches the list of news items. */
private String informationItem;
/** Matches the URL of a news item. */
private String informationUrl;
/** Matches the title of a news item. */
private String informationTitle;
/** Matches the publish time of a news item. */
private String informationPublishDate;
/** Matches the source of a news item. */
private String informationSource;
/** Custom entity. */
private Object extractInfo;
/** Crawl depth. */
private Integer crawlDepth;
/** Pagination URL. */
private String pageUrl;
/** Matches the page number. */
private String matchPage;
/** First page number. */
private Integer pageStart;
/** Last page number. */
private Integer pageEnd;
/** Whether all pages are wanted. */
private String ynPageAll;
/** Expression type (detail page). */
private String detailExpressionType;
/** Detail page URL. */
private String detailUrl;
/** Matches the detail page title. */
private String detailExpressionTitle;
/** Matches the detail page time. */
private String detailExpressionPublishDate;
/** Matches the detail page source. */
private String detailExpressionSource;
/** Matches the detail page author. */
private String detailExpressionAuthor;
/** Matches the detail page summary. */
private String detailExpressionSummary;
/** Matches the detail page body. */
private String detailExpressionContent;
/** Custom entity. */
private Object detailInfo;
/** Whether attachments are downloaded. */
private String ynDownload;
/** URL of the data table page. */
private String formUrl;
/** Title of the data table. */
private String formTitle;
/** Expression type (data table). */
private Integer formType;
/** Data table expression. */
private String dataFormExpression;
/** Custom data. */
private Object dataFormInfo;
/** Pagination URL (data table). */
private String dataPageUrl;
/** Pagination rule. */
private String dataPageRule;
/** First page number (data table). */
private Integer dataPageStart;
/** Last page number (data table). */
private Integer dataPageEnd;
/** Whether all data table pages are wanted. */
private String ynDataPageAll;
/** Data type. */
private Integer dataType;
/** Data format. */
private Integer dataFormat;
/** Data storage mode. */
private Integer dataStorageMode;
/** Data storage information. */
private Object dataStorageInfo;
/** Whether to crawl dynamically. */
private Integer ynDynamicCrawl;
/** Whether login is required. */
private Integer ynLogin;
/** Login domain name. */
private String domainName;
/** Login link. */
private String link;
/** Login account. */
private String account;
/** Login password. */
private String password;
/** userAgent */
private String userAgent;
/** referer */
private String referer;
/** cookies */
private String cookies;
/** headers */
private String headers;
/** Other parameters. */
private String otherInfo;
/** Crawler category. */
private Integer crawlType;
/** Crawler name. */
private String crawlName;
/** Crawler address. */
private String crawlAddress;
/** Parameter. */
private Object parameter;
/** Cron expression. */
private String cron;
// Separated from the fields above in the original; purpose not documented there.
private Pattern pattern;
}
package com.zzsn.entity;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
/**
 * WeChat URL conversion record: an original URL, its converted form, and a conversion flag.
 * Author: Li Huawei
 * Created: 2016-8-10 10:56:41
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 *
 * NOTE(review): the class implements Cloneable but does not override clone().
 * @version 1.0
 *
 */
@JsonSerialize(include= JsonSerialize.Inclusion.NON_NULL)
public class Wxurl implements Cloneable {
// columns START
/**
 * id db_column: ID
 */
private Long id;
/**
 * Flag indicating whether the URL has been converted.
 */
private Long flag;
/**
 * The URL to be converted.
 */
private String ourl;
/**
 * The URL after conversion.
 */
private String nurl;
/** Returns the id. */
public Long getId() {
return id;
}
public void setId(Long id) {
this.id = id;
}
public Long getFlag() {
return flag;
}
public void setFlag(Long flag) {
this.flag = flag;
}
public String getOurl() {
return ourl;
}
public void setOurl(String ourl) {
this.ourl = ourl;
}
public String getNurl() {
return nurl;
}
public void setNurl(String nurl) {
this.nurl = nurl;
}
}
package com.zzsn.extractor;
import com.zzsn.util.DateUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Finds the images and file links ({@code <img>} and {@code <a>} tags) inside an HTML body and
 * describes each of them as a {@link FileTag}.
 * Author: Li Dongliang
 * Created: 2016-8-30 17:25:04
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 */
public class ContentFileFinder {

    /** Captures a trailing extension of one to four word characters, e.g. {@code .jpg}. */
    private static final Pattern SUFFIX_PATTERN = Pattern.compile("/.+(\\.\\w{1,4})$");

    /** Link targets that never point at a file: {@code javascript...} or a bare {@code #}. */
    private static final Pattern NON_FILE_LINK = Pattern.compile("(?i)^javascript.*|#");

    /**
     * Returns the parent path of a URL, i.e. everything before its last {@code /}.
     *
     * <p>When the input has no {@code /}, or its only slashes belong to the {@code ://} of the
     * scheme (a URL without a path), the input is returned unchanged instead of throwing or
     * cutting into the scheme.
     *
     * @param path a URL or path
     * @return the parent path
     */
    public static String getDirPath(String path) {
        int lastSlash = path.lastIndexOf('/');
        int schemeEnd = path.indexOf("://");
        boolean onlySchemeSlashes = schemeEnd >= 0 && lastSlash <= schemeEnd + 2;
        if (lastSlash < 0 || onlySchemeSlashes) {
            return path;
        }
        return path.substring(0, lastSlash);
    }

    /**
     * Resolves an image/file path against the directory of the current page and removes
     * {@code ./} and {@code ../} segments.
     *
     * <p>The resolution is done on the string itself, so it behaves the same on every operating
     * system and does not fail on characters that are illegal in file names (the earlier
     * implementation went through {@link File#getCanonicalPath()} with a {@code D:/} prefix).
     *
     * @param currentPageURL directory URL of the current page, normally starting with
     *                       {@code http://} or {@code https://}
     * @param imgPath        path found in the tag: protocol-relative ({@code //host/x}),
     *                       host-absolute ({@code /x}) or relative ({@code x}, {@code ../x})
     * @return the resolved URL
     */
    public static String formatPath(String currentPageURL, String imgPath) {
        String start = "";
        if (currentPageURL.startsWith("http://")) {
            start = "http://";
        } else if (currentPageURL.startsWith("https://")) {
            start = "https://";
        }
        String base = currentPageURL.substring(start.length());

        // Protocol-relative: reuse the page's scheme; only the leading "//" is dropped.
        if (imgPath.startsWith("//")) {
            return start + imgPath.substring(2);
        }

        // Host-absolute: keep only the host part of the page URL.
        if (imgPath.startsWith("/")) {
            int hostEnd = base.indexOf('/');
            String domain = hostEnd == -1 ? base : base.substring(0, hostEnd);
            return start + domain + imgPath;
        }

        // Relative: normalise only the path part; a query string or fragment is kept verbatim.
        int suffixStart = firstIndexOfQueryOrFragment(imgPath);
        String relativePath = suffixStart < 0 ? imgPath : imgPath.substring(0, suffixStart);
        String suffix = suffixStart < 0 ? "" : imgPath.substring(suffixStart);
        return start + normalizeSegments(base + "/" + relativePath) + suffix;
    }

    /** Returns the index of the first {@code ?} or {@code #}, or -1 when neither occurs. */
    private static int firstIndexOfQueryOrFragment(String path) {
        int query = path.indexOf('?');
        int fragment = path.indexOf('#');
        if (query < 0) {
            return fragment;
        }
        if (fragment < 0) {
            return query;
        }
        return Math.min(query, fragment);
    }

    /**
     * Collapses empty, {@code .} and {@code ..} segments of a {@code host/path} string.
     * The first segment (the host) is never removed by {@code ..}.
     */
    private static String normalizeSegments(String hostAndPath) {
        List<String> kept = new ArrayList<String>();
        for (String segment : hostAndPath.split("/")) {
            if (segment.isEmpty() || ".".equals(segment)) {
                continue;
            }
            if ("..".equals(segment)) {
                if (kept.size() > 1) {
                    kept.remove(kept.size() - 1);
                }
                continue;
            }
            kept.add(segment);
        }
        StringBuilder joined = new StringBuilder();
        for (String segment : kept) {
            if (joined.length() > 0) {
                joined.append('/');
            }
            joined.append(segment);
        }
        return joined.toString();
    }

    /**
     * Generates the save path of a file: {@code <yyyy-MM-dd>/<uuid><suffix>}.
     *
     * @param suffix file extension including the dot, or an empty string
     * @return the relative save path
     */
    private static String genImgFileName(String suffix) {
        String dir = DateUtil.format(new Date(), "yyyy-MM-dd");
        String uuid = UUID.randomUUID().toString();
        return dir + "/" + uuid + suffix;
    }

    /**
     * Makes sure the tag has a {@code src} attribute holding the real image address, copying it
     * from the first attribute present among {@code original}, {@code data-src}, {@code _src},
     * {@code src} (in that order).
     *
     * @param imgTag the image element; modified in place
     * @return the same element, or {@code null} when none of those attributes is present
     */
    public static Element ensureSrc(Element imgTag) {
        String firstSrcAtt = null;
        if (imgTag.hasAttr("original")) {
            firstSrcAtt = "original";
        } else if (imgTag.hasAttr("data-src")) {
            firstSrcAtt = "data-src";
        } else if (imgTag.hasAttr("_src")) {
            firstSrcAtt = "_src";
        } else if (imgTag.hasAttr("src")) {
            firstSrcAtt = "src";
        }
        if (firstSrcAtt == null) {
            return null;
        }
        imgTag.attr("src", imgTag.attr(firstSrcAtt));
        return imgTag;
    }

    /**
     * Returns the absolute address stored in the given attribute of an element.
     *
     * @param element the tag
     * @param uri     address of the page containing the tag; when {@code null} no resolution
     *                is attempted
     * @param linkAtt name of the attribute holding the address ({@code src} or {@code href})
     * @return the absolute address, or {@code null} for {@code data:image}, {@code file:},
     *         {@code javascript...} and bare {@code #} targets
     */
    private static String getAbsolutePath(Element element, String uri, String linkAtt) {
        String absolutePath = element.attr(linkAtt);
        if (absolutePath.startsWith("data:image") || absolutePath.startsWith("file:")) {
            return null;
        }
        if (NON_FILE_LINK.matcher(absolutePath).matches()) {
            return null;
        }
        boolean alreadyAbsolute = absolutePath.startsWith("http://") || absolutePath.startsWith("https://");
        if (!alreadyAbsolute && uri != null) {
            absolutePath = formatPath(getDirPath(uri), absolutePath);
        }
        return absolutePath;
    }

    /**
     * Returns the file extension of a URL.
     *
     * @param uri the URL
     * @return the extension including the dot (one to four word characters at the very end of
     *         the URL), or an empty string when there is none
     */
    public static String getSuffix(String uri) {
        uri = uri.replaceAll("http://|https://", "");
        Matcher m = SUFFIX_PATTERN.matcher(uri);
        if (m.find()) {
            return m.group(1);
        }
        return "";
    }

    /**
     * Collects the file tags of a body, covering both images and attachment links, resolving
     * each address against the page address.
     *
     * @param content       HTML body; {@code null} or empty yields an empty map
     * @param sourceaddress address of the page the body came from
     * @return map from the tag's original outer HTML to its {@link FileTag}; tags without a
     *         usable address are left out
     */
    public static Map<String, FileTag> getContentFileTag(String content, String sourceaddress) {
        return collectFileTags(content, sourceaddress, true);
    }

    /**
     * Collects the file tags of a body without resolving addresses: every tag's absolute path
     * is the empty string.
     *
     * @param content HTML body; {@code null} or empty yields an empty map
     * @return map from the tag's original outer HTML to its {@link FileTag}
     */
    public static Map<String, FileTag> getContentFileTag(String content) {
        return collectFileTags(content, null, false);
    }

    /**
     * Shared implementation of both {@code getContentFileTag} overloads.
     *
     * @param resolveAddresses when {@code true} the address is resolved against
     *                         {@code sourceaddress}; when {@code false} it is the empty string
     */
    private static Map<String, FileTag> collectFileTags(String content, String sourceaddress,
            boolean resolveAddresses) {
        Map<String, FileTag> imgMap = new HashMap<String, FileTag>();
        if (content == null || content.length() == 0) {
            return imgMap;
        }
        Document doc = Jsoup.parse(content);
        for (Element tag : doc.select("img,a")) {
            FileTag fileTag = new FileTag();
            String rawTag = tag.outerHtml();
            String filePathAttr;
            String preFixPath;
            if (tag.tagName().toLowerCase().equals("img")) {
                filePathAttr = "src";
                // Point src at the real image address; an image without any source attribute
                // cannot be processed and is skipped (this used to end in a NullPointerException).
                if (ensureSrc(tag) == null) {
                    continue;
                }
                preFixPath = "IMG_SERVER/";
            } else {
                filePathAttr = "href";
                fileTag.setFileName(tag.text());
                preFixPath = "FILE_SERVER/";
            }

            // Absolute address of the file; the tag attribute is rewritten to it.
            String absolutePath = resolveAddresses ? getAbsolutePath(tag, sourceaddress, filePathAttr) : "";
            if (absolutePath == null) {
                continue;
            }
            tag.attr(filePathAttr, absolutePath);
            fileTag.setAbsolutePath(absolutePath);
            fileTag.setAbsoluteTag(tag.outerHtml());

            // Save path of the file.
            fileTag.setSavePath(genImgFileName(getSuffix(absolutePath)));

            // Tag rewritten to the save location.
            tag.attr(filePathAttr, preFixPath + fileTag.getSavePath());
            fileTag.setSaveTag(tag.outerHtml());

            // Keyed by the tag's outer HTML as it was before any rewriting.
            imgMap.put(rawTag, fileTag);
        }
        return imgMap;
    }

    public static void main(String[] args) {
        System.out.println(ContentFileFinder.getSuffix("http://www.baidu.com//a.xls"));
    }
}
package com.zzsn.extractor;
import java.io.File;
/**
 * The entity produced by analysis/extraction of a page: title, author, publish date,
 * origin, keywords, summary and the different forms of the body.
 * Author: Li Dongliang
 * Created: 2016-4-7 15:20:12
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 */
public class ExtEntity {
private String title;
private String author;
private String publishDate;
private String origin;
private String keywords;
private String summary;
private String contentWithTag;
private String contentNoTag;
private String contentImgCvtTag;
// The html and charset fields, and their accessors, are disabled in this class.
// private String html;
// private String charset;
private File file;
private String fileDownLoadPath;
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getPublishDate() {
return publishDate;
}
public void setPublishDate(String publishDate) {
this.publishDate = publishDate;
}
public String getOrigin() {
return origin;
}
public void setOrigin(String origin) {
this.origin = origin;
}
public String getKeywords() {
return keywords;
}
public void setKeywords(String keywords) {
this.keywords = keywords;
}
public String getSummary() {
return summary;
}
public void setSummary(String summary) {
this.summary = summary;
}
public String getContentWithTag() {
return contentWithTag;
}
public void setContentWithTag(String contentWithTag) {
this.contentWithTag = contentWithTag;
}
public String getContentNoTag() {
return contentNoTag;
}
public void setContentNoTag(String contentNoTag) {
this.contentNoTag = contentNoTag;
}
public String getContentImgCvtTag() {
return contentImgCvtTag;
}
public void setContentImgCvtTag(String contentImgCvtTag) {
this.contentImgCvtTag = contentImgCvtTag;
}
public File getFile() {
return file;
}
public void setFile(File file) {
this.file = file;
}
public String getFileDownLoadPath() {
return fileDownLoadPath;
}
public void setFileDownLoadPath(String fileDownLoadPath) {
this.fileDownLoadPath = fileDownLoadPath;
}
}
package com.zzsn.extractor;
import java.io.InputStream;
/**
 * Web extraction contract: read a page, then extract its fields into an {@link ExtEntity}.
 * Author: Li Dongliang
 * Created: 2016-9-13 11:32:28
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 */
public interface Extractor {

    /**
     * Reads the page content from an input stream.
     *
     * @param url         address of the page
     * @param inputStream stream delivering the page
     * @return whether the content could be read
     */
    boolean readEntity(String url, InputStream inputStream);

    /**
     * Runs the extraction and stores the results in the given entity.
     *
     * @param entity the entity to fill
     * @throws Exception when extraction fails
     */
    void process(ExtEntity entity) throws Exception;

    /**
     * Returns the content that was read.
     *
     * @return the page content
     */
    String getContent();

    /**
     * Returns the character set of the content.
     *
     * @return the character set
     */
    String getCharset();
}
package com.zzsn.extractor;
/**
 * Transfer object describing one image or file tag found in a body.
 * Author: Li Dongliang
 * Created: 2015-7-6 16:51:51
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 */
public class FileTag {
// The src path; a relative path is converted to an absolute one.
private String absolutePath;
// The tag after its src path has been converted to the absolute path.
private String absoluteTag;
// Tag used for replacement in the JSP.
private String saveTag;
// Path under which the image is saved.
private String savePath;
// File name used for download.
private String fileName;
public String getAbsolutePath() {
return absolutePath;
}
public void setAbsolutePath(String absolutePath) {
this.absolutePath = absolutePath;
}
public String getAbsoluteTag() {
return absoluteTag;
}
public void setAbsoluteTag(String absoluteTag) {
this.absoluteTag = absoluteTag;
}
public String getSaveTag() {
return saveTag;
}
public void setSaveTag(String saveTag) {
this.saveTag = saveTag;
}
public String getSavePath() {
return savePath;
}
public void setSavePath(String savePath) {
this.savePath = savePath;
}
public String getFileName() {
return fileName;
}
public void setFileName(String fileName) {
this.fileName = fileName;
}
}
package com.zzsn.extractor;
/**
 * Manual smoke test: extracts one fixed WeChat article and prints the extracted fields.
 */
public class TestWxDispatch {

    public static void main(String[] args) {
        WeiXinDispatch dispatcher = new WeiXinDispatch();
        String articleUrl = "http://mp.weixin.qq.com/s?__biz=MzAwODE2OTAwNg==&mid=2652292603&idx=2&sn=ef7a35347bc9dceb79c0fa42f8840f79&chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&scene=27#wechat_redirect";
        ExtEntity extracted = dispatcher.getExtractorElement(articleUrl);

        // Printed in this order: author, converted body, tagged body, title, date, plain body.
        System.out.println(extracted.getAuthor());
        System.out.println(extracted.getContentImgCvtTag());
        System.out.println(extracted.getContentWithTag());
        System.out.println(extracted.getTitle());
        System.out.println(extracted.getPublishDate());
        System.out.println(extracted.getContentNoTag());
    }
}
package com.zzsn.extractor;
import com.zzsn.extractor.web.Processor;
import com.zzsn.util.*;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 正文抽取处理类.抽取标题,摘要,正文,作者,字符集
* 创建人:李东亮
* 创建时间:2015-5-11 下午3:28:12
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class WebExtractorImplforweixin implements Extractor{
private static final Logger Log = LoggerFactory.getLogger(WebExtractorImplforweixin.class);
//抽取类型
public enum EXT_TYPE {
CONTENT, TITLE, KEYWORDS, SUMMARY, AUTHOR, PUBLISH_DATE, ORIGIN
}
private List<Processor> processors;
private HttpResponse getMethod;
private String html;
private String charset;
public WebExtractorImplforweixin(List<Processor> processors,HttpResponse getMethod){
this.processors = processors;
this.getMethod = getMethod;
}
/**
* 从inputstream中读取内容
* 创建人: 李东亮
* 创建时间: 2016-8-25 下午4:17:02
* @version 1.0
* @return
* @throws Exception
*/
public boolean readEntity(String url,InputStream inputStream) {
try{
html = FileUtil.readHtml(inputStream, Constants.READ_CHARSET);
Header header = getMethod.getFirstHeader("Content-Type");
charset = CharsetUtil.getCharset(html,header);
html = CharsetUtil.convertCorrectCharset(html,charset);
Document jsoupDoc = Jsoup.parse(html);
html = jsoupDoc.html();
}catch(Exception e){
return false;
}
if(html==null){
return false;
}else{
return true;
}
}
/**
* 获取带标签正文
* 创建人: 李东亮
* 创建时间: 2015-5-28 下午3:04:01
* @version 1.0
* @param body
* @return
*/
private String getContentWithTag(String body) {
String contentWithTag = ContentUtility.RemoveUselessHTMLTagX(body);
return contentWithTag;
}
/**
* 获取不带标签的正文
* 创建人: 李东亮
* 创建时间: 2015-5-28 下午3:06:32
* @version 1.0
* @param contentWithTag
* @return
*/
private String getContentNoTag(String contentWithTag) {
return ContentUtility.TransferHTML2Text(contentWithTag);
}
/**
* 去除html标签中的无效字符
* 创建人: 李东亮
* 创建时间: 2015-7-1 下午1:05:52
* @version 1.0
* @param html
* @return
*/
public String formatHtmlTag(String html) {
Pattern p = Pattern.compile("<[\\d|\\w|\\/]*([^(\\d|\\w|\\/)]+)[\\d|\\w|\\/]*>");
Matcher m = p.matcher(html);
String g;
while (m.find()) {
g = m.group();
html = html.replaceAll(g, g.replaceAll("[^(\\<|\\>|\\d|\\w|\\/)]+", ""));
}
return html;
}
/**
 * Fills the still-empty fields of {@code entity} from the page held in {@code html}.
 *
 * <p>Each configured processor is matched by the upper-cased name of its extraction type.
 * A processor is only run when the corresponding entity field is currently {@code null};
 * title, author, publish date, origin, keywords and summary are stored as tag-free text and
 * only when that text is non-empty, while the content is stored with its markup.
 *
 * <p>Publish date has a fallback: when the processor matched but produced empty text, the
 * ten characters following {@code ar ct = "} in the page are used instead (presumably an
 * epoch timestamp in seconds written by the page's script; confirm against real pages).
 *
 * Author: Li Dongliang, created 2016-04-07.
 *
 * @param entity target object that receives the extracted values
 * @throws Exception if a processor or one of the conversion helpers fails
 */
public void process(ExtEntity entity) throws Exception {
    for (Iterator<Processor> iterator = processors.iterator(); iterator.hasNext();) {
        Processor processor = iterator.next();
        String ename = processor.getExtType().getEname().toUpperCase();

        if (ename.equals(EXT_TYPE.TITLE.toString())) {
            if (entity.getTitle() == null) {
                String title = extractNonEmptyText(processor);
                if (title != null) {
                    entity.setTitle(title);
                }
            }
        } else if (ename.equals(EXT_TYPE.AUTHOR.toString())) {
            if (entity.getAuthor() == null) {
                String author = extractNonEmptyText(processor);
                if (author != null) {
                    entity.setAuthor(author);
                }
            }
        } else if (ename.equals(EXT_TYPE.PUBLISH_DATE.toString())) {
            if (entity.getPublishDate() == null) {
                String raw = processor.extract(html);
                if (raw != null) {
                    String text = this.getContentNoTag(raw);
                    if (text.length() > 0) {
                        entity.setPublishDate(text);
                    } else {
                        // The matched element was empty, so fall back to the value in the script.
                        String seconds = findEmbeddedCreateTime();
                        if (seconds != null) {
                            entity.setPublishDate(DateUtil.tiemString2String(seconds + "000", true));
                        }
                    }
                }
            }
        } else if (ename.equals(EXT_TYPE.ORIGIN.toString())) {
            if (entity.getOrigin() == null) {
                String origin = extractNonEmptyText(processor);
                if (origin != null) {
                    entity.setOrigin(origin);
                }
            }
        } else if (ename.equals(EXT_TYPE.KEYWORDS.toString())) {
            if (entity.getKeywords() == null) {
                String keywords = extractNonEmptyText(processor);
                if (keywords != null) {
                    entity.setKeywords(keywords);
                }
            }
        } else if (ename.equals(EXT_TYPE.SUMMARY.toString())) {
            if (entity.getSummary() == null) {
                String summary = extractNonEmptyText(processor);
                if (summary != null) {
                    entity.setSummary(summary);
                }
            }
        } else if (ename.equals(EXT_TYPE.CONTENT.toString())) {
            if (entity.getContentWithTag() == null) {
                String raw = processor.extract(html);
                if (raw != null) {
                    entity.setContentWithTag(this.getContentWithTag(raw));
                }
            }
        }
    }
}

/**
 * Runs {@code processor} on the page and returns the tag-free text, or {@code null} when
 * the processor found nothing or the text is empty.
 */
private String extractNonEmptyText(Processor processor) {
    String raw = processor.extract(html);
    if (raw == null) {
        return null;
    }
    String text = this.getContentNoTag(raw);
    return (text != null && text.length() > 0) ? text : null;
}

/**
 * Returns the ten digits that follow {@code ar ct = "} in the page, or {@code null} when
 * the marker is absent, fewer than ten characters follow it, or they are not all digits.
 */
private String findEmbeddedCreateTime() {
    final String marker = "ar ct = \"";
    int markerAt = html.indexOf(marker);
    if (markerAt < 0) {
        return null;
    }
    int from = markerAt + marker.length();
    int to = from + 10;
    if (to > html.length()) {
        // Truncated page: the unchecked substring used to throw here.
        return null;
    }
    String candidate = html.substring(from, to);
    for (int i = 0; i < candidate.length(); i++) {
        if (!Character.isDigit(candidate.charAt(i))) {
            return null;
        }
    }
    return candidate;
}
/**
 * Returns the page markup currently stored in the {@code html} field.
 *
 * @return the stored HTML
 */
public String getContent() {
    return this.html;
}
/**
 * Returns the value currently stored in the {@code charset} field.
 *
 * @return the stored charset name
 */
public String getCharset() {
    return this.charset;
}
}
package com.zzsn.extractor.web;
import java.util.ArrayList;
import java.util.List;
/**
 * Default result object: a numeric status plus a list of error messages.
 *
 * Author: Li Dongliang
 * Created: 2015-05-13 10:23:02
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class DefaultMsg {

    /** Status value; a new instance starts at 1. */
    private int success = 1;

    /** Error messages; a new instance starts with an empty, mutable list. */
    private List<String> errors = new ArrayList<>();

    /**
     * @return the current status value
     */
    public int getSuccess() {
        return this.success;
    }

    /**
     * @param success the status value to store
     */
    public void setSuccess(int success) {
        this.success = success;
    }

    /**
     * @return the stored error list itself (not a copy)
     */
    public List<String> getErrors() {
        return this.errors;
    }

    /**
     * @param errors the list to store; the reference is kept as-is
     */
    public void setErrors(List<String> errors) {
        this.errors = errors;
    }
}
package com.zzsn.extractor.web;
import java.io.Serializable;
/**
 * Describes one extraction rule: which field it targets and how to locate it.
 *
 * Author: Li Dongliang
 * Created: 2015-05-18 16:39:22
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class ExtType implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Name of the extraction type. */
    private String ename;

    /** Expression used for this extraction type. */
    private String exp;

    /** Tags that should be subtracted from the selection. */
    private String subtraction;

    /** When set, the value of this attribute on the selected tag is used as the result. */
    private String attr;

    /**
     * @return the extraction type name
     */
    public String getEname() {
        return this.ename;
    }

    /**
     * @param ename the extraction type name
     */
    public void setEname(String ename) {
        this.ename = ename;
    }

    /**
     * @return the extraction expression
     */
    public String getExp() {
        return this.exp;
    }

    /**
     * @param exp the extraction expression
     */
    public void setExp(String exp) {
        this.exp = exp;
    }

    /**
     * @return the tags to subtract
     */
    public String getSubtraction() {
        return this.subtraction;
    }

    /**
     * @param subtraction the tags to subtract
     */
    public void setSubtraction(String subtraction) {
        this.subtraction = subtraction;
    }

    /**
     * @return the attribute name whose value is used as the result, if any
     */
    public String getAttr() {
        return this.attr;
    }

    /**
     * @param attr the attribute name whose value is used as the result
     */
    public void setAttr(String attr) {
        this.attr = attr;
    }
}
package com.zzsn.extractor.web;
/**
 * Contract for a single extraction step applied to a page.
 *
 * Author: Li Dongliang
 * Created: 2015-05-17 15:06:21
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public interface Processor {

    /**
     * Extracts this processor's target from the page.
     *
     * @param html page markup to extract from
     * @return the extracted value
     */
    String extract(String html);

    /**
     * Initialises the processor.
     *
     * @return the initialisation result
     */
    DefaultMsg init();

    /**
     * Returns the extraction rule this processor works with.
     *
     * @return the processor's {@link ExtType}
     */
    ExtType getExtType();
}
package com.zzsn.extractor.web;
import com.zzsn.util.Constants;
import com.zzsn.util.FileUtil;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
/**
 * Builds {@link Processor} instances from an XML extraction template.
 *
 * Author: Li Dongliang
 * Created: 2015-05-27 14:08:13
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class ProcessorReader {

    private static final Logger Log = LoggerFactory.getLogger(ProcessorReader.class);

    /**
     * Creates a processor of type {@code SCOPE} whose expression is {@code scopeTags}.
     *
     * @param scopeTags expression for the scope; may be {@code null} or empty
     * @return an initialised processor, or {@code null} when {@code scopeTags} is null or empty
     */
    public static Processor getScopeProcessor(String scopeTags) {
        if (scopeTags == null || scopeTags.length() == 0) {
            return null;
        }
        ExtType extType = new ExtType();
        extType.setEname("SCOPE");
        extType.setExp(scopeTags);
        Processor processor = new JsoupTagProcessor(extType);
        processor.init();
        return processor;
    }

    /**
     * Parses the template and creates one processor per extraction type found in it.
     *
     * <p>For every {@code WebExtractorImpl.EXT_TYPE} value the root element is searched for a
     * child named after the lower-cased enum constant. Its {@code exp} child is required;
     * {@code subtraction} and {@code attr} are optional, and a blank {@code attr} is ignored.
     *
     * <p>A template that is {@code null} or cannot be parsed yields an empty list and an
     * error log entry. Previously the parse error was printed and the method then failed
     * with a {@code NullPointerException}. An entry without {@code exp} is skipped with a
     * warning instead of failing the whole read.
     *
     * @param templete XML text of the template
     * @return the processors that could be built, in enum declaration order; never {@code null}
     */
    public static List<Processor> readProcessors(String templete) {
        Log.debug("===读取配置文件开始============");
        List<Processor> processors = new ArrayList<Processor>();
        if (templete == null) {
            Log.error("Processor template is null; no processors loaded");
            return processors;
        }

        org.jdom.Document doc;
        try {
            doc = new SAXBuilder().build(new StringReader(templete));
        } catch (JDOMException | IOException e) {
            Log.error("Failed to parse processor template; no processors loaded", e);
            return processors;
        }

        Element root = doc.getRootElement();
        for (WebExtractorImpl.EXT_TYPE extTypeStr : WebExtractorImpl.EXT_TYPE.values()) {
            String ename = extTypeStr.toString().toLowerCase();
            Element extTypeEle = root.getChild(ename);
            if (extTypeEle == null) {
                continue;
            }
            Element expEle = extTypeEle.getChild("exp");
            if (expEle == null) {
                Log.warn("Template entry <{}> has no <exp> child; skipped", ename);
                continue;
            }

            ExtType extType = new ExtType();
            extType.setEname(ename);
            extType.setExp(expEle.getText().trim());

            Element subtractionEle = extTypeEle.getChild("subtraction");
            if (subtractionEle != null) {
                extType.setSubtraction(subtractionEle.getText().trim());
            }

            Element attrEle = extTypeEle.getChild("attr");
            if (attrEle != null) {
                String attr = attrEle.getText();
                if (attr != null) {
                    attr = attr.trim();
                    if (attr.length() > 0) {
                        extType.setAttr(attr);
                    }
                }
            }

            Processor processor = new JsoupTagProcessor(extType);
            processor.init();
            Log.debug("=======");
            Log.debug("{}:", extType.getEname());
            Log.debug("exp[{}]", extType.getExp());
            Log.debug("subtraction[{}]", extType.getSubtraction());
            Log.debug("attr[{}]", extType.getAttr());
            Log.debug("=======");
            processors.add(processor);
        }
        Log.debug("===读取配置文件结束,共有{}项需要爬取============", processors.size());
        return processors;
    }

    /**
     * Reads the template file located at {@code Constants.path} as UTF-8 and builds its
     * processors via {@link #readProcessors(String)}.
     *
     * @return the processors described by that template
     */
    public static List<Processor> readWeChatProcessors() {
        String templete = FileUtil.readFile(new File(Constants.path), "UTF-8");
        return readProcessors(templete);
    }

    /** Empty entry point kept for compatibility. */
    public static void main(String[] args) {
    }
}
package com.zzsn.extractor.web;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.Extractor;
import com.zzsn.util.CharsetUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.FileUtil;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts title, summary, content, author and related fields from a fetched page.
 *
 * Author: Li Dongliang
 * Created: 2015-05-11 15:28:12
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class WebExtractorImpl implements Extractor {

    private static final Logger Log = LoggerFactory.getLogger(WebExtractorImpl.class);

    /** Matches a tag that contains a run of characters outside digits, word chars, '/', '(', ')' and '|'. */
    private static final Pattern MALFORMED_TAG =
            Pattern.compile("<[\\d|\\w|\\/]*([^(\\d|\\w|\\/)]+)[\\d|\\w|\\/]*>");

    /** Matches the characters that are stripped from a malformed tag. */
    private static final Pattern TAG_NOISE = Pattern.compile("[^(\\<|\\>|\\d|\\w|\\/)]+");

    /** Extraction types. */
    public enum EXT_TYPE {
        CONTENT, TITLE, KEYWORDS, SUMMARY, AUTHOR, PUBLISH_DATE, ORIGIN
    }

    private List<Processor> processors;
    private HttpResponse getMethod;
    private String html;
    private String charset;

    /**
     * @param processors processors to run in {@link #process(ExtEntity)}
     * @param getMethod  HTTP response whose {@code Content-Type} header is consulted for the charset
     */
    public WebExtractorImpl(List<Processor> processors, HttpResponse getMethod) {
        this.processors = processors;
        this.getMethod = getMethod;
    }

    /**
     * Reads the page from {@code inputStream}, determines its charset, converts the text and
     * normalises it through Jsoup. The result is kept for {@link #process(ExtEntity)}.
     *
     * <p>Failures are logged together with {@code url}; they used to be discarded silently.
     *
     * Author: Li Dongliang, created 2016-08-25.
     *
     * @param url         address of the page, used for logging only
     * @param inputStream body of the page
     * @return {@code true} when the page was read and is non-null, {@code false} otherwise
     */
    public boolean readEntity(String url, InputStream inputStream) {
        try {
            html = FileUtil.readHtml(inputStream, Constants.READ_CHARSET);
            Header header = getMethod.getFirstHeader("Content-Type");
            charset = CharsetUtil.getCharset(html, header);
            html = CharsetUtil.convertCorrectCharset(html, charset);
            Document jsoupDoc = Jsoup.parse(html);
            html = jsoupDoc.html();
        } catch (Exception e) {
            Log.error("Failed to read page content from {}", url, e);
            return false;
        }
        return html != null;
    }

    /**
     * Returns the extracted body after {@code ContentUtility.RemoveUselessHTMLTagX} has been applied.
     *
     * @param body HTML fragment returned by a processor
     * @return the cleaned fragment, still containing markup
     */
    private String getContentWithTag(String body) {
        return ContentUtility.RemoveUselessHTMLTagX(body);
    }

    /**
     * Converts an HTML fragment to text via {@code ContentUtility.TransferHTML2Text}.
     *
     * @param contentWithTag HTML fragment to convert
     * @return the converted text
     */
    private String getContentNoTag(String contentWithTag) {
        return ContentUtility.TransferHTML2Text(contentWithTag);
    }

    /**
     * Removes stray characters from malformed HTML tags.
     *
     * <p>Every tag matched by {@code MALFORMED_TAG} has the characters matched by
     * {@code TAG_NOISE} removed, and each occurrence of the original tag text is replaced by
     * the cleaned text. The replacement is literal: the previous implementation handed the
     * tag text to {@code String.replaceAll} as a regular expression, so tags containing
     * metacharacters such as {@code ?}, {@code *} or {@code [} were replaced wrongly or
     * raised {@code PatternSyntaxException}.
     *
     * Author: Li Dongliang, created 2015-07-01.
     *
     * @param html markup to clean; may be {@code null}
     * @return the cleaned markup, or {@code null} when {@code html} is {@code null}
     */
    public String formatHtmlTag(String html) {
        if (html == null) {
            return null;
        }
        Matcher matcher = MALFORMED_TAG.matcher(html);
        String cleaned = html;
        while (matcher.find()) {
            String tag = matcher.group();
            String repaired = TAG_NOISE.matcher(tag).replaceAll("");
            cleaned = cleaned.replace(tag, repaired);
        }
        return cleaned;
    }

    /**
     * Fills the still-empty fields of {@code entity} from the page read by
     * {@link #readEntity(String, InputStream)}.
     *
     * <p>Each processor is matched by the upper-cased name of its extraction type and is only
     * run when the corresponding entity field is {@code null}. Title, author, publish date,
     * origin, keywords and summary are stored as tag-free text and only when non-empty; the
     * content is stored with its markup.
     *
     * Author: Li Dongliang, created 2016-04-07.
     *
     * @param entity target object that receives the extracted values
     * @throws Exception if a processor or one of the conversion helpers fails
     */
    public void process(ExtEntity entity) throws Exception {
        for (Processor processor : processors) {
            String ename = processor.getExtType().getEname().toUpperCase();

            if (ename.equals(EXT_TYPE.TITLE.toString())) {
                if (entity.getTitle() == null) {
                    String title = extractNonEmptyText(processor);
                    if (title != null) {
                        entity.setTitle(title);
                    }
                }
            } else if (ename.equals(EXT_TYPE.AUTHOR.toString())) {
                if (entity.getAuthor() == null) {
                    String author = extractNonEmptyText(processor);
                    if (author != null) {
                        entity.setAuthor(author);
                    }
                }
            } else if (ename.equals(EXT_TYPE.PUBLISH_DATE.toString())) {
                if (entity.getPublishDate() == null) {
                    String publishDate = extractNonEmptyText(processor);
                    if (publishDate != null) {
                        entity.setPublishDate(publishDate);
                    }
                }
            } else if (ename.equals(EXT_TYPE.ORIGIN.toString())) {
                if (entity.getOrigin() == null) {
                    String origin = extractNonEmptyText(processor);
                    if (origin != null) {
                        entity.setOrigin(origin);
                    }
                }
            } else if (ename.equals(EXT_TYPE.KEYWORDS.toString())) {
                if (entity.getKeywords() == null) {
                    String keywords = extractNonEmptyText(processor);
                    if (keywords != null) {
                        entity.setKeywords(keywords);
                    }
                }
            } else if (ename.equals(EXT_TYPE.SUMMARY.toString())) {
                if (entity.getSummary() == null) {
                    String summary = extractNonEmptyText(processor);
                    if (summary != null) {
                        entity.setSummary(summary);
                    }
                }
            } else if (ename.equals(EXT_TYPE.CONTENT.toString())) {
                if (entity.getContentWithTag() == null) {
                    String raw = processor.extract(html);
                    if (raw != null) {
                        entity.setContentWithTag(this.getContentWithTag(raw));
                    }
                }
            }
        }
    }

    /**
     * Runs {@code processor} on the page and returns the tag-free text, or {@code null} when
     * the processor found nothing or the text is empty.
     */
    private String extractNonEmptyText(Processor processor) {
        String raw = processor.extract(html);
        if (raw == null) {
            return null;
        }
        String text = this.getContentNoTag(raw);
        return (text != null && text.length() > 0) ? text : null;
    }

    /**
     * @return the page markup produced by the last {@link #readEntity(String, InputStream)} call
     */
    public String getContent() {
        return html;
    }

    /**
     * @return the charset determined by the last {@link #readEntity(String, InputStream)} call
     */
    public String getCharset() {
        return charset;
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<template><content><exp>*.div[id="js_content"]</exp></content><title><exp>*.h2[class="rich_media_title"]</exp></title><author><exp>*.a[id="js_name"]</exp></author><publish_date><exp>*.em[id="post-date"]</exp></publish_date></template>
package com.zzsn.job;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Thread pool whose {@link #execute(Runnable)} blocks the submitting thread once the pool
 * has grown to its maximum size, and resumes it when a task finishes.
 *
 * <p>Intended for feeding a pool from a message queue; because the submitter is held back,
 * the original author did not customise the rejection policy.
 *
 * <p>NOTE(review): the wait is a single {@code await()} guarded by {@code if}, so a spurious
 * wake-up releases the submitter early. The condition (pool size equals maximum size) stays
 * true after a task completes, so it cannot simply be turned into a {@code while} loop.
 */
public class BlockThreadPoolExecute extends ThreadPoolExecutor {

    private final ReentrantLock lock = new ReentrantLock();
    private final Condition condition = this.lock.newCondition();

    /**
     * @param corePoolSize    see {@link ThreadPoolExecutor}
     * @param maximumPoolSize pool size at which submitters start to block
     * @param keepAliveTime   see {@link ThreadPoolExecutor}
     * @param unit            unit of {@code keepAliveTime}
     * @param workQueue       queue holding tasks before they run
     */
    public BlockThreadPoolExecute(int corePoolSize, int maximumPoolSize, long keepAliveTime,
                                  TimeUnit unit, BlockingQueue<Runnable> workQueue) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /**
     * Submits {@code command} and, if the pool is then at its maximum size, waits until a
     * task signals completion.
     *
     * <p>The submission happens inside {@code try/finally} so the lock is released even when
     * the task is rejected. Previously a rejection left the lock held forever and every
     * worker blocked in {@link #afterExecute(Runnable, Throwable)}.
     */
    @Override
    public void execute(Runnable command) {
        this.lock.lock();
        try {
            super.execute(command);
            // Hold the submitter back once the pool has reached its maximum size.
            if (getPoolSize() == getMaximumPoolSize()) {
                this.condition.await();
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for the caller instead of printing and dropping it.
            Thread.currentThread().interrupt();
        } finally {
            this.lock.unlock();
        }
    }

    /** Wakes one blocked submitter after a task has finished. */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
        super.afterExecute(r, t);
        this.lock.lock();
        try {
            this.condition.signal();
        } finally {
            this.lock.unlock();
        }
    }
}
\ No newline at end of file
package com.zzsn.job;
import com.zzsn.util.Constants;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.List;
import java.util.Set;
/**
 * Static Redis helpers backed by one shared {@link JedisPool}.
 *
 * <p>Most operations prepend {@code PREFIX} to the key; {@link #getKeysByPattern},
 * {@link #sadd} and {@link #sismember} use the key exactly as given.
 *
 * <p>The pool is created once, on first use. Previously every call built a new pool and
 * never returned the borrowed connection, leaking both.
 *
 * <p>NOTE(review): {@code Constants.REDIS_PASS} exists but the pool is created without a
 * password, as before; confirm whether authentication is required.
 */
public class JedisUtil {

    private static final String PREFIX = "weixin_";
    /** Expiry applied by {@link #setString} when the caller passes a non-positive value: 15 days. */
    private static final int DEFAULT_EXPIRE_SECONDS = 60 * 60 * 24 * 15;
    private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);
    private static volatile JedisPool jedisPool = null;
    private static final JedisUtil jedisUtil = new JedisUtil();

    private JedisUtil() {
    }

    /** Builds the connection pool from the Redis settings in {@code Constants}. */
    private static void init() {
        JedisPoolConfig config = new JedisPoolConfig();
        // Maximum number of connections the pool hands out.
        config.setMaxTotal(Integer.parseInt(Constants.REDIS_MAXTOTAL));
        // Maximum number of idle connections kept in the pool.
        config.setMaxIdle(Integer.parseInt(Constants.REDIS_MAXIDLE));
        // Longest time a borrower waits for a connection.
        config.setMaxWaitMillis(Long.parseLong(Constants.REDIS_MAXWAITMILLIS));
        // Whether connections are validated when borrowed.
        config.setTestOnBorrow(Boolean.valueOf(Constants.REDIS_TESTONBORROW));
        jedisPool = new JedisPool(config, Constants.REDIS_LOCALHOST,
                Integer.parseInt(Constants.REDIS_PORT), Integer.parseInt(Constants.REDIS_TIMEOUT));
    }

    /** Returns the shared pool, creating it on first use. */
    private static synchronized JedisPool pool() {
        if (jedisPool == null) {
            init();
        }
        return jedisPool;
    }

    private static Jedis getJedis() {
        return pool().getResource();
    }

    /** Logs and throws when {@code key} is null or empty. */
    private static void requireKey(String key) throws Exception {
        if (StringUtils.isEmpty(key)) {
            logger.error("key is null");
            throw new Exception("key is null");
        }
    }

    /** Logs and throws when {@code key} or {@code flag} is null or empty. */
    private static void requireKeyAndFlag(String key, String flag) throws Exception {
        if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
            logger.error("key or flag is null");
            throw new Exception("key or flag is null");
        }
    }

    /**
     * @return the singleton instance (all operations are static; kept for existing callers)
     */
    public static JedisUtil getInstance() {
        return jedisUtil;
    }

    /**
     * Returns a connection obtained from {@link #getDefaultJedis()} to the pool.
     *
     * @param jedis connection to release; {@code null} is ignored
     */
    public static void returnResource(final Jedis jedis) {
        if (jedis != null && jedisPool != null) {
            jedis.close();
        }
    }

    /**
     * Borrows a connection from the pool. The caller must release it with
     * {@link #returnResource(Jedis)} or {@code close()}.
     *
     * @return a pooled connection
     */
    public static Jedis getDefaultJedis() {
        return getJedis();
    }

    /**
     * @param pattern key pattern, used as given (no prefix is added)
     * @return the keys matching {@code pattern}
     */
    public static Set<String> getKeysByPattern(String pattern) {
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.keys(pattern);
        }
    }

    /**
     * @param key key without prefix
     * @return whether the prefixed key exists
     * @throws Exception if {@code key} is null or empty
     */
    public static boolean exists(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.exists(PREFIX + key);
        }
    }

    /**
     * Deletes the prefixed key.
     *
     * @param key key without prefix
     * @throws Exception if {@code key} is null or empty
     */
    public static void del(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            jedis.del(PREFIX + key);
        }
    }

    /**
     * Stores a string under the prefixed key and sets its expiry.
     *
     * @param key        key without prefix
     * @param value      value to store
     * @param expireTime expiry in seconds; a non-positive value selects the 15-day default
     * @throws Exception if {@code key} is null or empty
     */
    public static void setString(String key, String value, int expireTime) throws Exception {
        requireKey(key);
        String finalKey = PREFIX + key;
        int ttl = expireTime > 0 ? expireTime : DEFAULT_EXPIRE_SECONDS;
        try (Jedis jedis = getDefaultJedis()) {
            jedis.set(finalKey, value);
            jedis.expire(finalKey, ttl);
        }
    }

    /**
     * @param key key without prefix
     * @return the string stored under the prefixed key
     * @throws Exception if {@code key} is null or empty
     */
    public static String getString(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.get(PREFIX + key);
        }
    }

    /**
     * Sets the prefixed key only if it does not exist yet.
     *
     * @param key   key without prefix
     * @param value value to store
     * @return the reply of Redis {@code SETNX}
     * @throws Exception if {@code key} is null or empty
     */
    public static long setnx(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.setnx(PREFIX + key, value);
        }
    }

    /**
     * Sets the expiry of the prefixed key.
     *
     * @param key     key without prefix
     * @param seconds expiry in seconds
     * @return the reply of Redis {@code EXPIRE}
     * @throws Exception if {@code key} is null or empty
     */
    public static long expire(String key, int seconds) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.expire(PREFIX + key, seconds);
        }
    }

    /**
     * Pushes a value onto the list stored under the prefixed key.
     *
     * @param key   key without prefix
     * @param value value to push
     * @param flag  {@code "L"} pushes at the head, {@code "R"} at the tail (case-insensitive)
     * @throws Exception if {@code key} or {@code flag} is null or empty, or the flag is unknown
     */
    public static void pushList(String key, String value, String flag) throws Exception {
        requireKeyAndFlag(key, flag);
        boolean left = flag.equalsIgnoreCase("L");
        if (!left && !flag.equalsIgnoreCase("R")) {
            logger.error("unknown flag");
            throw new Exception("unknown flag");
        }
        try (Jedis jedis = getDefaultJedis()) {
            if (left) {
                jedis.lpush(PREFIX + key, value);
            } else {
                jedis.rpush(PREFIX + key, value);
            }
        }
    }

    /**
     * Pops a value from the list stored under the prefixed key.
     *
     * @param key  key without prefix
     * @param flag {@code "L"} pops from the head, {@code "R"} from the tail (case-insensitive)
     * @return the popped value
     * @throws Exception if {@code key} or {@code flag} is null or empty, or the flag is unknown
     */
    public static String popList(String key, String flag) throws Exception {
        requireKeyAndFlag(key, flag);
        boolean left = flag.equalsIgnoreCase("L");
        if (!left && !flag.equalsIgnoreCase("R")) {
            logger.error("unknown flag");
            throw new Exception("unknown flag");
        }
        try (Jedis jedis = getDefaultJedis()) {
            return left ? jedis.lpop(PREFIX + key) : jedis.rpop(PREFIX + key);
        }
    }

    /**
     * Returns the elements of the prefixed list within the given index range.
     *
     * @param key   key without prefix
     * @param start first index passed to {@code LRANGE}
     * @param end   last index passed to {@code LRANGE}
     * @return the selected elements
     * @throws Exception if {@code key} is null or empty
     */
    public static List<String> getAppointedList(String key, long start, long end) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.lrange(PREFIX + key, start, end);
        }
    }

    /**
     * @param key key without prefix
     * @return all elements of the prefixed list
     * @throws Exception if {@code key} is null or empty
     */
    public static List<String> getList(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.lrange(PREFIX + key, 0, -1);
        }
    }

    /**
     * Adds a member to a set. The key is used as given (no prefix), matching the existing data layout.
     *
     * @param key   set key
     * @param value member to add
     * @throws Exception if {@code key} is null or empty
     */
    public static void sadd(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            jedis.sadd(key, value);
        }
    }

    /**
     * Tests set membership. The key is used as given (no prefix).
     *
     * @param key   set key
     * @param value member to look for
     * @return whether {@code value} is a member of the set
     * @throws Exception if {@code key} is null or empty
     */
    public static boolean sismember(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.sismember(key, value);
        }
    }
}
package com.zzsn.job;
import com.alibaba.fastjson.JSON;
import com.zzsn.crawler.WeixinDetailThread;
import com.zzsn.crawler.WeixinSiteThread;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.util.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@Component
@EnableScheduling
@Slf4j
public class KafkaConsumerJob {
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//kafka数据的读取方式
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
/**
* 从kafka中获取公众号信息进行发送获取列表内容提取链接
*/
@Scheduled(cron = "0 0/3 * * * ?")
@Async("asyncTaskExecutor")
public void wxOfficialConsumer (){
ExecutorService threadPool = Executors.newSingleThreadExecutor();
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
try{
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class);
WeixinSiteThread siteThread=new WeixinSiteThread();
siteThread.siteMsgTemple=siteMsgTemple;
//创建使用固定线程数的线程池
threadPool.execute(siteThread);
TimeUnit.SECONDS.sleep(20);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
threadPool.shutdown();
while(true)
{
boolean isfinished = threadPool.isTerminated();
if(isfinished)
break;
}
}
ExecutorService serviceDetail = Executors.newFixedThreadPool(1);
/**
* 从kafka中获取微信资讯url进行解析详情
*/
// @Scheduled(cron = "0 0/2 * * * ?")
public void wxDetailconsumer (){
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_WXDETAILURL_TOPIC));
try{
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class);
WeixinDetailThread siteThread=new WeixinDetailThread();
siteThread.siteMsgTemple=siteMsgTemple;
// siteThread.start();
//创建使用固定线程数的线程池
serviceDetail.execute(()->{
String threadName= Thread.currentThread().getName();
System.out.println(threadName+"开始执行");
try {
siteThread.start();
TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) {
e.printStackTrace();
}
System.out.println(threadName+"执行结束");
});
TimeUnit.SECONDS.sleep(10);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_WXDETAILURL_TOPIC));
}
}
private static KafkaConsumer<String, String> create2Consumer() {
Properties props = new Properties();
// 必须设置的属性
props.put("bootstrap.servers", "114.115.159.144:9092");
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("group.id", "group1");
// 可选设置属性
props.put("enable.auto.commit", "true");
// 自动提交offset,每1s提交一次
props.put("auto.commit.interval.ms", "1000");
props.put("auto.offset.reset","earliest ");
props.put("client.id", "es-sync");
return new KafkaConsumer<>(props);
}
}
package com.zzsn.job;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.core.ProducerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Spring configuration that exposes a {@link KafkaTemplate} for sending string messages.
 *
 * Original author tag: Java小白
 */
@Configuration
@EnableKafka
public class KafkaProducerConfig {

    /** Broker list, from {@code kafka.producer.servers}. */
    @Value("${kafka.producer.servers}")
    private String servers;

    /** Retry count, from {@code kafka.producer.retries}. */
    @Value("${kafka.producer.retries}")
    private int retries;

    /** Batch size, from {@code kafka.producer.batch.size}. */
    @Value("${kafka.producer.batch.size}")
    private int batchSize;

    /** Linger time, from {@code kafka.producer.linger}. */
    @Value("${kafka.producer.linger}")
    private int linger;

    /** Buffer memory, from {@code kafka.producer.buffer.memory}. */
    @Value("${kafka.producer.buffer.memory}")
    private int bufferMemory;

    /**
     * Assembles the producer settings from the injected properties.
     *
     * @return a new mutable map of producer configuration entries
     */
    public Map<String, Object> producerConfigs() {
        Map<String, Object> settings = new HashMap<>();
        // Serialisation: keys and values are plain strings.
        settings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        settings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        // Values injected from the application properties.
        settings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, servers);
        settings.put(ProducerConfig.RETRIES_CONFIG, retries);
        settings.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize);
        settings.put(ProducerConfig.LINGER_MS_CONFIG, linger);
        settings.put(ProducerConfig.BUFFER_MEMORY_CONFIG, bufferMemory);
        // Fixed maximum request size.
        settings.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG,"10485760");
        return settings;
    }

    /**
     * Creates the producer factory from {@link #producerConfigs()}.
     *
     * @return a new producer factory
     */
    public ProducerFactory<String, String> producerFactory() {
        Map<String, Object> settings = producerConfigs();
        return new DefaultKafkaProducerFactory<>(settings);
    }

    /**
     * Bean used by the application to send messages.
     *
     * @return a template backed by {@link #producerFactory()}
     */
    @Bean
    public KafkaTemplate<String, String> kafkaTemplate() {
        ProducerFactory<String, String> factory = producerFactory();
        return new KafkaTemplate<String, String>(factory);
    }
}
\ No newline at end of file
package com.zzsn.job;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
 * Loads {@code .properties} files from the classpath.
 */
public class PropertyUtil {

    /**
     * Loads a properties file from the classpath.
     *
     * <p>The resource is looked up under the given name first and, if absent, under
     * {@code properties/<name>}. When it cannot be found or read, an empty
     * {@link Properties} is returned and a message is written to {@code System.err}.
     * Previously a missing resource caused a {@code NullPointerException} and the stream was
     * never closed.
     *
     * @param propertyFile classpath-relative resource name; may be {@code null}
     * @return the loaded properties, never {@code null}
     */
    public static Properties loadProperties(String propertyFile) {
        Properties properties = new Properties();
        if (propertyFile == null) {
            System.err.println("Property file name is null; returning empty properties");
            return properties;
        }

        ClassLoader loader = PropertyUtil.class.getClassLoader();
        InputStream located = loader.getResourceAsStream(propertyFile);
        if (located == null) {
            located = loader.getResourceAsStream("properties/" + propertyFile);
        }
        if (located == null) {
            System.err.println("Property file not found on classpath: " + propertyFile);
            return properties;
        }

        try (InputStream is = located) {
            properties.load(is);
        } catch (IOException e) {
            System.err.println("Failed to load property file: " + propertyFile);
            e.printStackTrace();
        }
        return properties;
    }

    /**
     * Looks up one value in a properties file. The file is loaded on every call.
     *
     * @param propertyFile classpath-relative resource name
     * @param key          property key
     * @return the property value, or {@code null} when the key or the file is missing
     */
    public static String getValue(String propertyFile, String key) {
        Properties properties = loadProperties(propertyFile);
        return properties.getProperty(key);
    }
}
package com.zzsn.job;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Properties;
import java.util.Set;
/**
 * Static Redis cache helpers backed by a {@link JedisPool} configured from
 * {@code conf/redis.properties}. Keys are used exactly as given.
 *
 * <p>NOTE(review): {@code redis.pass} is not read and the pool is created without a
 * password, as before; confirm whether authentication is required.
 */
public class RediscachedFactory {

    private static final Logger logger = LoggerFactory.getLogger(RediscachedFactory.class);
    private static volatile JedisPool jedisPool = null;

    /**
     * Creates the connection pool from {@code conf/redis.properties}.
     *
     * Author: Li Dongliang, created 2015-05-30.
     */
    public static synchronized void init() {
        Properties properties = PropertyUtil.loadProperties("conf/redis.properties");
        String host = properties.getProperty("redis.host");
        String port = properties.getProperty("redis.port");
        String timeout = properties.getProperty("redis.timeout");
        String maxIdle = properties.getProperty("redis.maxIdle");
        String maxTotal = properties.getProperty("redis.maxTotal");
        String maxWaitMillis = properties.getProperty("redis.maxWaitMillis");
        String testOnBorrow = properties.getProperty("redis.testOnBorrow");

        JedisPoolConfig config = new JedisPoolConfig();
        // Maximum number of connections the pool hands out.
        config.setMaxTotal(Integer.parseInt(maxTotal));
        // Maximum number of idle connections kept in the pool.
        config.setMaxIdle(Integer.parseInt(maxIdle));
        // Longest time a borrower waits for a connection.
        config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
        // Whether connections are validated when borrowed.
        config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
        jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
    }

    /** Borrows a connection, creating the pool first if {@link #init()} has not been called yet. */
    private static Jedis getJedis() {
        if (jedisPool == null) {
            synchronized (RediscachedFactory.class) {
                if (jedisPool == null) {
                    init();
                }
            }
        }
        return jedisPool.getResource();
    }

    /**
     * Reads a string value.
     *
     * @param key key to read
     * @return the stored value, or {@code null} when the key is empty, absent, or the read fails
     */
    public static String getKeyStr(String key) {
        if (StringUtils.isEmpty(key)) {
            return null;
        }
        try (Jedis jedis = getJedis()) {
            return jedis.get(key);
        } catch (Exception e) {
            logger.error("Redis GET failed for key {}", key, e);
            return null;
        }
    }

    /**
     * Stores a string value without setting an expiry.
     *
     * @param key   key to write
     * @param value value to store
     * @return {@code true} when the value was written; {@code false} for an empty key or on
     *         failure (the method used to return {@code false} unconditionally)
     */
    public static boolean setKeyStr(String key, String value) {
        if (StringUtils.isEmpty(key)) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.set(key, value);
            return true;
        } catch (Exception e) {
            logger.error("Redis SET failed for key {}", key, e);
            return false;
        }
    }

    /**
     * Stores a string value and, when {@code time} is positive, sets its expiry.
     *
     * @param key   key to write
     * @param value value to store
     * @param time  expiry in seconds; {@code null} or a non-positive value leaves the key without expiry
     * @return {@code true} when the value was written; {@code false} for an empty key or on
     *         failure (the method used to return {@code false} unconditionally)
     */
    public static boolean setKeyStrtime(String key, String value, Integer time) {
        if (StringUtils.isEmpty(key)) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.set(key, value);
            if (time != null && time > 0) {
                jedis.expire(key, time.intValue());
            }
            return true;
        } catch (Exception e) {
            logger.error("Redis SET with expiry failed for key {}", key, e);
            return false;
        }
    }

    /**
     * Reads all members of a set.
     *
     * @param key set key
     * @return the members, or {@code null} when the read fails
     */
    public static Set<String> getKeySet(String key) {
        try (Jedis jedis = getJedis()) {
            return jedis.smembers(key);
        } catch (Exception e) {
            logger.error("Redis SMEMBERS failed for key {}", key, e);
            return null;
        }
    }

    /**
     * Adds the given members to a set without setting an expiry.
     *
     * @param key   set key
     * @param value members to add
     * @return {@code true} when the members were sent to Redis; {@code false} when
     *         {@code value} is {@code null} or empty, or on failure (the method used to
     *         return {@code false} unconditionally)
     */
    public static boolean setKeySet(String key, Set<String> value) {
        if (value == null || value.isEmpty()) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.sadd(key, value.toArray(new String[value.size()]));
            return true;
        } catch (Exception e) {
            logger.error("Redis SADD failed for key {}", key, e);
            return false;
        }
    }

    /**
     * Deletes a key. An empty key is logged and ignored.
     *
     * @param key key to delete
     */
    public static void del(String key) {
        if (StringUtils.isEmpty(key)) {
            logger.error("key is null");
            return;
        }
        try (Jedis jedis = getJedis()) {
            jedis.del(key);
        } catch (Exception e) {
            logger.error("Redis DEL failed for key {}", key, e);
        }
    }
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论