提交 5880af84 作者: liuweigang

微信修改项目提交

上级 5131584c
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<component name="ArtifactManager">
<!--
  IntelliJ IDEA artifact definition for the exploded WAR "weixinCrawler:war exploded",
  written to $PROJECT_DIR$/out/artifacts/weixinCrawler_war_exploded.
  Layout: WEB-INF/classes takes the weixinCrawler module output, WEB-INF/lib takes the
  Maven libraries listed below, and the javaee-facet-resources element for facet
  weixinCrawler/web/Web sits at the artifact root.
  NOTE(review): spring-boot-starter-web is listed at 2.6.5 while every other
  org.springframework.boot entry is 2.2.0.RELEASE, and spring-kafka 2.8.4 is listed next to
  Spring Framework 5.2.0.RELEASE jars. Confirm in the module pom that this mix is intended.
-->
<artifact type="exploded-war" name="weixinCrawler:war exploded">
<output-path>$PROJECT_DIR$/out/artifacts/weixinCrawler_war_exploded</output-path>
<root id="root">
<element id="directory" name="WEB-INF">
<element id="directory" name="classes">
<element id="module-output" name="weixinCrawler" />
</element>
<element id="directory" name="lib">
<element id="library" level="module" name="Maven: jackson-all:jackson-all:1.7.6" module-name="weixinCrawler" />
<element id="library" level="module" name="Maven: jedis:jedis:3.0.1" module-name="weixinCrawler" />
<element id="library" level="project" name="Maven: org.jdom:jdom:1.1" />
<element id="library" level="project" name="Maven: io.protostuff:protostuff-core:1.6.0" />
<element id="library" level="project" name="Maven: io.protostuff:protostuff-api:1.6.0" />
<element id="library" level="project" name="Maven: io.protostuff:protostuff-runtime:1.6.0" />
<element id="library" level="project" name="Maven: io.protostuff:protostuff-collectionschema:1.6.0" />
<element id="library" level="project" name="Maven: org.apache.httpcomponents:httpcore:4.4.10" />
<element id="library" level="project" name="Maven: org.apache.httpcomponents:httpclient:4.5.6" />
<element id="library" level="project" name="Maven: commons-codec:commons-codec:1.13" />
<element id="library" level="project" name="Maven: com.squareup.okhttp3:okhttp:3.3.1" />
<element id="library" level="project" name="Maven: com.squareup.okio:okio:1.8.0" />
<element id="library" level="project" name="Maven: com.google.protobuf:protobuf-java:3.2.0" />
<element id="library" level="project" name="Maven: com.burgstaller:okhttp-digest:1.15" />
<element id="library" level="project" name="Maven: com.baomidou:mybatis-plus-boot-starter:3.4.1" />
<element id="library" level="project" name="Maven: com.baomidou:mybatis-plus:3.4.1" />
<element id="library" level="project" name="Maven: com.baomidou:mybatis-plus-extension:3.4.1" />
<element id="library" level="project" name="Maven: com.baomidou:mybatis-plus-core:3.4.1" />
<element id="library" level="project" name="Maven: com.baomidou:mybatis-plus-annotation:3.4.1" />
<element id="library" level="project" name="Maven: com.github.jsqlparser:jsqlparser:3.2" />
<element id="library" level="project" name="Maven: org.mybatis:mybatis:3.5.6" />
<element id="library" level="project" name="Maven: org.mybatis:mybatis-spring:2.0.5" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-autoconfigure:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-jdbc:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: com.zaxxer:HikariCP:3.4.1" />
<element id="library" level="project" name="Maven: org.springframework:spring-jdbc:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: com.alibaba:druid:1.0.5" />
<element id="library" level="project" name="Maven: mysql:mysql-connector-java:8.0.18" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-data-redis:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.data:spring-data-redis:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.data:spring-data-keyvalue:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.data:spring-data-commons:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-oxm:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-aop:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: io.lettuce:lettuce-core:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: io.netty:netty-common:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.netty:netty-handler:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.netty:netty-buffer:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.netty:netty-codec:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.netty:netty-transport:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.netty:netty-resolver:4.1.42.Final" />
<element id="library" level="project" name="Maven: io.projectreactor:reactor-core:3.3.0.RELEASE" />
<element id="library" level="project" name="Maven: org.reactivestreams:reactive-streams:1.0.3" />
<element id="library" level="project" name="Maven: com.alibaba:fastjson:1.2.13" />
<element id="library" level="project" name="Maven: org.springframework.kafka:spring-kafka:2.8.4" />
<element id="library" level="project" name="Maven: org.springframework:spring-context:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-expression:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-messaging:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-tx:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.retry:spring-retry:1.2.4.RELEASE" />
<element id="library" level="project" name="Maven: org.apache.kafka:kafka-clients:2.3.0" />
<element id="library" level="project" name="Maven: com.github.luben:zstd-jni:1.4.0-1" />
<element id="library" level="project" name="Maven: org.lz4:lz4-java:1.6.0" />
<element id="library" level="project" name="Maven: org.xerial.snappy:snappy-java:1.1.7.3" />
<element id="library" level="project" name="Maven: com.google.code.findbugs:jsr305:3.0.2" />
<element id="library" level="project" name="Maven: com.jayway.jsonpath:json-path:2.4.0" />
<element id="library" level="project" name="Maven: net.minidev:json-smart:2.3" />
<element id="library" level="project" name="Maven: net.minidev:accessors-smart:1.2" />
<element id="library" level="project" name="Maven: org.ow2.asm:asm:5.0.4" />
<element id="library" level="project" name="Maven: org.slf4j:slf4j-api:1.7.28" />
<element id="library" level="project" name="Maven: org.jsoup:jsoup:1.14.2" />
<element id="library" level="project" name="Maven: com.whalin:Memcached-Java-Client:3.0.2" />
<element id="library" level="project" name="Maven: commons-pool:commons-pool:1.6" />
<element id="library" level="project" name="Maven: net.spy:spymemcached:2.12.2" />
<element id="library" level="project" name="Maven: com.googlecode.xmemcached:xmemcached:2.4.7" />
<element id="library" level="project" name="Maven: cn.wanghaomiao:JsoupXpath:2.5.1" />
<element id="library" level="project" name="Maven: org.antlr:antlr4-runtime:4.7.2" />
<element id="library" level="project" name="Maven: org.apache.commons:commons-pool2:2.7.0" />
<element id="library" level="project" name="Maven: org.apache.commons:commons-lang3:3.7" />
<element id="library" level="project" name="Maven: org.quartz-scheduler:quartz:2.2.1" />
<element id="library" level="project" name="Maven: c3p0:c3p0:0.9.1.1" />
<element id="library" level="project" name="Maven: org.springframework:spring-context-support:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-beans:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-core:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-jcl:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: cn.hutool:hutool-all:5.3.8" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-logging:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: ch.qos.logback:logback-classic:1.2.3" />
<element id="library" level="project" name="Maven: ch.qos.logback:logback-core:1.2.3" />
<element id="library" level="project" name="Maven: org.apache.logging.log4j:log4j-to-slf4j:2.12.1" />
<element id="library" level="project" name="Maven: org.apache.logging.log4j:log4j-api:2.12.1" />
<element id="library" level="project" name="Maven: org.slf4j:jul-to-slf4j:1.7.28" />
<element id="library" level="project" name="Maven: jakarta.annotation:jakarta.annotation-api:1.3.5" />
<element id="library" level="project" name="Maven: org.yaml:snakeyaml:1.25" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-devtools:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.projectlombok:lombok:1.18.10" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-web:2.6.5" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-json:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.core:jackson-databind:2.10.0" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.core:jackson-annotations:2.10.0" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.core:jackson-core:2.10.0" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.10.0" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.10.0" />
<element id="library" level="project" name="Maven: com.fasterxml.jackson.module:jackson-module-parameter-names:2.10.0" />
<element id="library" level="project" name="Maven: org.springframework.boot:spring-boot-starter-tomcat:2.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.apache.tomcat.embed:tomcat-embed-el:9.0.27" />
<element id="library" level="project" name="Maven: org.apache.tomcat.embed:tomcat-embed-websocket:9.0.27" />
<element id="library" level="project" name="Maven: org.springframework:spring-web:5.2.0.RELEASE" />
<element id="library" level="project" name="Maven: org.springframework:spring-webmvc:5.2.0.RELEASE" />
</element>
</element>
<element id="javaee-facet-resources" facet="weixinCrawler/web/Web" />
</root>
</artifact>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <!-- Annotation-processing profiles: a default profile plus the Maven profile shared by both modules. -->
  <component name="CompilerConfiguration">
    <annotationProcessing>
      <profile default="true" name="Default" enabled="true" />
      <profile name="Maven default annotation processors profile" enabled="true">
        <sourceOutputDir name="target/generated-sources/annotations" />
        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
        <outputRelativeToContentRoot value="true" />
        <module name="crawler_2022" />
        <module name="weixinCrawler" />
      </profile>
    </annotationProcessing>
  </component>
  <!-- Extra javac options per module: weixinCrawler is compiled with -parameters. -->
  <component name="JavacSettings">
    <option name="ADDITIONAL_OPTIONS_OVERRIDE">
      <module name="weixinCrawler" options="-parameters" />
    </option>
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <!-- File-encoding settings: UTF-8 for properties files, for the listed sources, and as the project-wide entry. -->
  <component name="Encoding" defaultCharsetForPropertiesFiles="UTF-8">
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/awx/controller/WeixinController.java"
          charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/awx/service/SiteService.java"
          charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/entity/SiteMsgRecord.java"
          charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/test/Test.java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/java/com/zzsn/util/WeixinUtil.java" charset="UTF-8" />
    <file url="file://$PROJECT_DIR$/weixinCrawler/src/main/resources/constants.properties" charset="UTF-8" />
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <!-- Remote Maven repositories known to the IDE. Two entries share the id "central":
       the Aliyun mirror and Maven Central itself. -->
  <component name="RemoteRepositoriesConfiguration">
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Central Repository" />
      <!-- HTTPS rather than plain HTTP: artifacts fetched over HTTP can be tampered with in
           transit, and Maven 3.8.1+ (this project's wrapper pins 3.8.4) blocks external
           http:// repositories by default.
           NOTE(review): apply the same change wherever this mirror is declared
           (settings.xml or a pom), otherwise a Maven re-import may restore the old URL. -->
      <option name="url" value="https://maven.aliyun.com/nexus/content/groups/public/" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="central" />
      <option name="name" value="Maven Central repository" />
      <option name="url" value="https://repo1.maven.org/maven2" />
    </remote-repository>
    <remote-repository>
      <option name="id" value="jboss.community" />
      <option name="name" value="JBoss Community repository" />
      <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
    </remote-repository>
  </component>
</project>
\ No newline at end of file
<component name="libraryTable">
  <!-- Project library backed by the jar checked in under weixinCrawler/lib.
       NOTE(review): the exploded-WAR artifact also lists Maven's mysql-connector-java 8.0.18;
       confirm whether this 5.1.7 jar is still attached to any module. -->
  <library name="mysql-connector-java-5.1.7-bin">
    <CLASSES>
      <root url="jar://$PROJECT_DIR$/weixinCrawler/lib/mysql-connector-java-5.1.7-bin.jar!/" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
  </library>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <!-- Project-level IDE settings: imported Maven pom files, project JDK 1.8 / language level 8,
       and compiler output under $PROJECT_DIR$/out. -->
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="FrameworkDetectionExcludesConfiguration">
    <file type="web" url="file://$PROJECT_DIR$" />
  </component>
  <component name="MavenProjectsManager">
    <option name="originalFiles">
      <list>
        <option value="$PROJECT_DIR$/pom.xml" />
        <option value="$PROJECT_DIR$/weixinCrawler/pom.xml" />
      </list>
    </option>
  </component>
  <component name="ProjectRootManager"
             version="2"
             languageLevel="JDK_1_8"
             default="true"
             project-jdk-name="1.8"
             project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
  <component name="ProjectType">
    <option name="id" value="jpab" />
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <!-- Run-configuration producers the IDE should ignore for this project. -->
  <component name="RunConfigurationProducerService">
    <option name="ignoredProducers">
      <set>
        <option value="com.android.tools.idea.compose.preview.runconfiguration.ComposePreviewRunConfigurationProducer" />
      </set>
    </option>
  </component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<!--
  UI Designer palette ("Palette2"). The single "Swing" group lists one item per component
  class with its icon, its binding/label flags and its default layout constraints
  (size policies, anchor, fill); some items also carry an initial "text" property or a
  preferred size.
-->
<component name="Palette2">
<group name="Swing">
<item class="com.intellij.uiDesigner.HSpacer" tooltip-text="Horizontal Spacer" icon="/com/intellij/uiDesigner/icons/hspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="1" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="com.intellij.uiDesigner.VSpacer" tooltip-text="Vertical Spacer" icon="/com/intellij/uiDesigner/icons/vspacer.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="1" anchor="0" fill="2" />
</item>
<item class="javax.swing.JPanel" icon="/com/intellij/uiDesigner/icons/panel.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3" />
</item>
<item class="javax.swing.JScrollPane" icon="/com/intellij/uiDesigner/icons/scrollPane.png" removable="false" auto-create-binding="false" can-attach-label="true">
<default-constraints vsize-policy="7" hsize-policy="7" anchor="0" fill="3" />
</item>
<item class="javax.swing.JButton" icon="/com/intellij/uiDesigner/icons/button.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="0" fill="1" />
<initial-values>
<property name="text" value="Button" />
</initial-values>
</item>
<item class="javax.swing.JRadioButton" icon="/com/intellij/uiDesigner/icons/radioButton.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="RadioButton" />
</initial-values>
</item>
<item class="javax.swing.JCheckBox" icon="/com/intellij/uiDesigner/icons/checkBox.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="3" anchor="8" fill="0" />
<initial-values>
<property name="text" value="CheckBox" />
</initial-values>
</item>
<item class="javax.swing.JLabel" icon="/com/intellij/uiDesigner/icons/label.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="8" fill="0" />
<initial-values>
<property name="text" value="Label" />
</initial-values>
</item>
<item class="javax.swing.JTextField" icon="/com/intellij/uiDesigner/icons/textField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JPasswordField" icon="/com/intellij/uiDesigner/icons/passwordField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JFormattedTextField" icon="/com/intellij/uiDesigner/icons/formattedTextField.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1">
<preferred-size width="150" height="-1" />
</default-constraints>
</item>
<item class="javax.swing.JTextArea" icon="/com/intellij/uiDesigner/icons/textArea.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTextPane" icon="/com/intellij/uiDesigner/icons/textPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JEditorPane" icon="/com/intellij/uiDesigner/icons/editorPane.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JComboBox" icon="/com/intellij/uiDesigner/icons/comboBox.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="2" anchor="8" fill="1" />
</item>
<item class="javax.swing.JTable" icon="/com/intellij/uiDesigner/icons/table.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JList" icon="/com/intellij/uiDesigner/icons/list.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="2" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTree" icon="/com/intellij/uiDesigner/icons/tree.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3">
<preferred-size width="150" height="50" />
</default-constraints>
</item>
<item class="javax.swing.JTabbedPane" icon="/com/intellij/uiDesigner/icons/tabbedPane.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSplitPane" icon="/com/intellij/uiDesigner/icons/splitPane.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="3" hsize-policy="3" anchor="0" fill="3">
<preferred-size width="200" height="200" />
</default-constraints>
</item>
<item class="javax.swing.JSpinner" icon="/com/intellij/uiDesigner/icons/spinner.png" removable="false" auto-create-binding="true" can-attach-label="true">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSlider" icon="/com/intellij/uiDesigner/icons/slider.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="8" fill="1" />
</item>
<item class="javax.swing.JSeparator" icon="/com/intellij/uiDesigner/icons/separator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="6" anchor="0" fill="3" />
</item>
<item class="javax.swing.JProgressBar" icon="/com/intellij/uiDesigner/icons/progressbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1" />
</item>
<item class="javax.swing.JToolBar" icon="/com/intellij/uiDesigner/icons/toolbar.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="6" anchor="0" fill="1">
<preferred-size width="-1" height="20" />
</default-constraints>
</item>
<item class="javax.swing.JToolBar$Separator" icon="/com/intellij/uiDesigner/icons/toolbarSeparator.png" removable="false" auto-create-binding="false" can-attach-label="false">
<default-constraints vsize-policy="0" hsize-policy="0" anchor="0" fill="1" />
</item>
<item class="javax.swing.JScrollBar" icon="/com/intellij/uiDesigner/icons/scrollbar.png" removable="false" auto-create-binding="true" can-attach-label="false">
<default-constraints vsize-policy="6" hsize-policy="0" anchor="0" fill="2" />
</item>
</group>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor for the crawler_2022 WAR: UTF-8 sources, Java 1.8, pinned plugin versions. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.zzsn</groupId>
  <artifactId>crawler_2022</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>war</packaging>

  <name>crawler_2022</name>
  <!-- FIXME: replace with the project's real website -->
  <url>http://www.zzsn.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- JUnit is currently disabled; re-enable this dependency when tests are added.
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    -->
  </dependencies>

  <build>
    <finalName>crawler_2022</finalName>
    <!-- Plugin versions are locked here so the build does not depend on Maven's defaults
         (this section may be moved to a parent pom). -->
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- Plugins bound to the war packaging lifecycle, see
             http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-war-plugin</artifactId>
          <version>3.2.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
# Download location of the Maven distribution archive (Apache Maven 3.8.4, binary zip).
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip
# Download location of maven-wrapper 3.1.0; the mvnw script reads this key and, when present,
# uses it in place of its built-in jar URL.
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar
微信公众号爬取实现原理:
1.配置公众号对应的信息。
2.使用代码与企业微信建立连接,并将对应的公众号链接通过代码发送到对应的微信号上。
3.使用自动化软件触发手机微信接收消息点击访问的步骤,使用电脑fiddler抓取对应的页面信息。
4.使用代码从页面中提取微信公众号中的信息链接。
5.对抽取到的信息链接进行访问和信息抽取。
微信爬虫流程:
1.从kafka获取微信公众号信息。
2.从链接中获取微信公众号id。
3.使用代码模拟企业微信并将信息发送给相关的微信号。
4.手机微信号接收到公众号的链接信息。
5.使用免root自动化助手创建自动化脚本模拟点击手机微信消息操作。
6.使用fiddler抓取微信点击微信公众号的页面,并通过fiddler的脚本将抓取的页面信息发送给程序代码。
7.代码实现对页面信息的抽取解析获取对应的资讯链接地址,并发送到kafka中。
8.接收kafka中的资讯链接地址并对相关的内容信息进行抽取,再发送到相应的kafka的topic中。
<?xml version="1.0" encoding="UTF-8"?>
<!-- WebSphere web-module PME extension descriptor (schema version 1.0); it declares no settings. -->
<web-ext-pme xmlns="http://websphere.ibm.com/xml/ns/javaee"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://websphere.ibm.com/xml/ns/javaee http://websphere.ibm.com/xml/ns/javaee/ibm-web-ext-pme_1_0.xsd"
             version="1.0">
</web-ext-pme>
#!/bin/sh
# ----------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Maven Start Up Batch script
#
# Required ENV vars:
# ------------------
# JAVA_HOME - location of a JDK home dir
#
# Optional ENV vars
# -----------------
# M2_HOME - location of maven2's installed home dir
# MAVEN_OPTS - parameters passed to the Java VM when running Maven
# e.g. to debug Maven itself, use
# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
# MAVEN_SKIP_RC - flag to disable loading of mavenrc files
# ----------------------------------------------------------------------------
# Source the optional mavenrc files, system-wide ones first and the per-user
# one last, unless MAVEN_SKIP_RC is set to a non-empty value.
if [ -z "$MAVEN_SKIP_RC" ]; then
  [ -f /usr/local/etc/mavenrc ] && . /usr/local/etc/mavenrc
  [ -f /etc/mavenrc ] && . /etc/mavenrc
  [ -f "$HOME/.mavenrc" ] && . "$HOME/.mavenrc"
fi
# OS specific support. $var _must_ be set to either true or false.
# Platform flags. Each one holds exactly "true" or "false" because later code
# runs the value as a command (`if $cygwin; then ...`).
cygwin=false
darwin=false
mingw=false
case "$(uname)" in
  CYGWIN*)
    cygwin=true
    ;;
  MINGW*)
    mingw=true
    ;;
  Darwin*)
    darwin=true
    # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home
    # See https://developer.apple.com/library/mac/qa/qa1170/_index.html
    if [ -z "$JAVA_HOME" ]; then
      if [ -x "/usr/libexec/java_home" ]; then
        JAVA_HOME="$(/usr/libexec/java_home)"
      else
        JAVA_HOME="/Library/Java/Home"
      fi
      export JAVA_HOME
    fi
    ;;
esac

# On Gentoo, ask java-config for the JRE home when JAVA_HOME is still unset.
if [ -z "$JAVA_HOME" ] && [ -r /etc/gentoo-release ]; then
  JAVA_HOME=$(java-config --jre-home)
fi
# Derive M2_HOME from the location of this script when the caller did not set it.
if [ -z "$M2_HOME" ]; then
  ## resolve links - $0 may be a link to maven's home
  PRG="$0"

  # Follow the link chain one hop at a time; relative targets are resolved
  # against the directory of the link itself.
  while [ -h "$PRG" ]; do
    ls=$(ls -ld "$PRG")
    link=$(expr "$ls" : '.*-> \(.*\)$')
    if expr "$link" : '/.*' > /dev/null; then
      PRG="$link"
    else
      PRG="$(dirname "$PRG")/$link"
    fi
  done

  saveddir=$(pwd)

  # Maven's home is the parent of the directory holding the script.
  M2_HOME=$(dirname "$PRG")/..

  # make it fully qualified
  M2_HOME=$(cd "$M2_HOME" && pwd)

  cd "$saveddir"
fi
# For Cygwin, ensure paths are in UNIX format before anything is touched
# For Cygwin, ensure paths are in UNIX format before anything is touched
if $cygwin; then
  if [ -n "$M2_HOME" ]; then
    M2_HOME=$(cygpath --unix "$M2_HOME")
  fi
  if [ -n "$JAVA_HOME" ]; then
    JAVA_HOME=$(cygpath --unix "$JAVA_HOME")
  fi
  if [ -n "$CLASSPATH" ]; then
    CLASSPATH=$(cygpath --path --unix "$CLASSPATH")
  fi
fi

# For Mingw, ensure paths are in UNIX format before anything is touched.
# Changing into the directory and printing the working directory yields the
# UNIX-style spelling of the path.
if $mingw; then
  if [ -n "$M2_HOME" ]; then
    M2_HOME="$(cd "$M2_HOME"; pwd)"
  fi
  if [ -n "$JAVA_HOME" ]; then
    JAVA_HOME="$(cd "$JAVA_HOME"; pwd)"
  fi
fi
# JAVA_HOME is still unset: try to derive it from the javac found on PATH.
if [ -z "$JAVA_HOME" ]; then
javaExecutable="`which javac`"
# Continue only when `which` printed something whose first word is not "no".
# NOTE(review): presumably this guards against `which` implementations that
# print "no javac in ..." instead of an empty result - confirm.
if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
# readlink(1) is not available as standard on Solaris 10.
readLink=`which readlink`
# Same first-word check for readlink; without it JAVA_HOME stays unset here.
if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
if $darwin ; then
# Darwin: resolve javac's directory physically with `pwd -P` rather than `readlink -f`.
javaHome="`dirname \"$javaExecutable\"`"
javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
else
javaExecutable="`readlink -f \"$javaExecutable\"`"
fi
javaHome="`dirname \"$javaExecutable\"`"
# Keep the part of the directory path that precedes its last "/bin".
javaHome=`expr "$javaHome" : '\(.*\)/bin'`
JAVA_HOME="$javaHome"
export JAVA_HOME
fi
fi
fi
# Pick the java launcher unless the caller already supplied JAVACMD.
if [ -z "$JAVACMD" ]; then
  if [ -z "$JAVA_HOME" ]; then
    # No JAVA_HOME: use the java found on PATH. The leading backslashes bypass
    # aliases, and `unset -f command` drops any shell function named "command".
    JAVACMD="$(\unset -f command; \command -v java)"
  elif [ -x "$JAVA_HOME/jre/sh/java" ]; then
    # IBM's JDK on AIX uses strange locations for the executables
    JAVACMD="$JAVA_HOME/jre/sh/java"
  else
    JAVACMD="$JAVA_HOME/bin/java"
  fi
fi

# Abort when the chosen launcher is not an executable file.
if [ ! -x "$JAVACMD" ]; then
  echo "Error: JAVA_HOME is not defined correctly." >&2
  echo " We cannot execute $JAVACMD" >&2
  exit 1
fi

if [ -z "$JAVA_HOME" ]; then
  echo "Warning: JAVA_HOME environment variable is not set."
fi

CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
# Traverses the directory structure from the directory given as $1 up to the
# filesystem root and prints the first directory that has a .mvn subdirectory.
# Prints $1 itself when no such directory is found.
#
# Arguments: $1 - directory to start from (required)
# Output:    the project base directory on stdout
# Returns:   1, with a diagnostic on stderr and nothing on stdout, when $1 is empty
find_maven_basedir() {
  if [ -z "$1" ]
  then
    # Diagnostics go to stderr: callers capture stdout as the base directory,
    # so a message printed there would be mistaken for a path.
    echo "Path not specified to find_maven_basedir" >&2
    return 1
  fi

  basedir="$1"
  wdir="$1"
  while [ "$wdir" != '/' ] ; do
    if [ -d "$wdir"/.mvn ] ; then
      basedir=$wdir
      break
    fi
    # workaround for JBEAP-8937 (on Solaris 10/Sparc)
    if [ -d "${wdir}" ]; then
      wdir=$(cd "$wdir/.."; pwd)
    else
      # wdir is not a directory, so it can never advance towards "/":
      # stop here instead of looping forever.
      break
    fi
    # end of workaround
  done
  echo "${basedir}"
}
# Concatenates all lines of the file given as $1 into one space-separated line.
# Prints nothing when $1 is not a regular file.
#
# Carriage returns are treated like newlines, so a file checked out with CRLF
# line endings does not leave a stray CR glued to each option.
concat_lines() {
  if [ -f "$1" ]; then
    # Both CR and LF become a space; -s squeezes the resulting runs of spaces.
    echo "$(tr -s '\r\n' '  ' < "$1")"
  fi
}
# Locate the project base directory starting from the current working
# directory; give up when nothing was printed.
BASE_DIR=$(find_maven_basedir "$(pwd)")
[ -n "$BASE_DIR" ] || exit 1
##########################################################################################
# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
# This allows using the maven wrapper in projects that prohibit checking in binary data.
##########################################################################################
# Make sure .mvn/wrapper/maven-wrapper.jar exists, downloading it when missing.
# Download URL precedence: wrapperUrl from maven-wrapper.properties, then
# $MVNW_REPOURL, then Maven Central. Tools are tried in the order wget, curl,
# and finally the bundled MavenWrapperDownloader.java.
# MVNW_USERNAME / MVNW_PASSWORD, when both are set, are passed to wget or curl.
if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then
  if [ "$MVNW_VERBOSE" = true ]; then
    echo "Found .mvn/wrapper/maven-wrapper.jar"
  fi
else
  if [ "$MVNW_VERBOSE" = true ]; then
    echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..."
  fi
  if [ -n "$MVNW_REPOURL" ]; then
    jarUrl="$MVNW_REPOURL/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
  else
    jarUrl="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
  fi
  # `|| [ -n "$key" ]` still processes the last line when the file does not end
  # with a newline (read fails at EOF but has already filled the variables).
  while IFS="=" read key value || [ -n "$key" ]; do
    case "$key" in
      wrapperUrl)
        # Strip CR so a properties file with CRLF line endings yields a usable URL.
        jarUrl=$(printf '%s' "$value" | tr -d '\r')
        break
        ;;
    esac
  done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties"
  if [ "$MVNW_VERBOSE" = true ]; then
    echo "Downloading from: $jarUrl"
  fi
  wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar"
  if $cygwin; then
    wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"`
  fi

  if command -v wget > /dev/null; then
    if [ "$MVNW_VERBOSE" = true ]; then
      echo "Found wget ... using wget"
    fi
    if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
      wget "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
    else
      # Credentials are quoted so spaces or glob characters in them are passed through intact.
      wget --http-user="$MVNW_USERNAME" --http-password="$MVNW_PASSWORD" "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath"
    fi
  elif command -v curl > /dev/null; then
    if [ "$MVNW_VERBOSE" = true ]; then
      echo "Found curl ... using curl"
    fi
    if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then
      curl -o "$wrapperJarPath" "$jarUrl" -f
    else
      curl --user "$MVNW_USERNAME:$MVNW_PASSWORD" -o "$wrapperJarPath" "$jarUrl" -f
    fi
  else
    if [ "$MVNW_VERBOSE" = true ]; then
      echo "Falling back to using Java to download"
    fi
    javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java"
    # For Cygwin, switch paths to Windows format before running javac
    if $cygwin; then
      javaClass=`cygpath --path --windows "$javaClass"`
    fi
    if [ -e "$javaClass" ]; then
      if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then
        if [ "$MVNW_VERBOSE" = true ]; then
          echo " - Compiling MavenWrapperDownloader.java ..."
        fi
        # Compiling the Java class
        ("$JAVA_HOME/bin/javac" "$javaClass")
      fi
      if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then
        # Running the downloader
        if [ "$MVNW_VERBOSE" = true ]; then
          echo " - Running MavenWrapperDownloader.java ..."
        fi
        # MAVEN_PROJECTBASEDIR is only assigned after this section, so compute
        # the same value here instead of passing a possibly empty argument.
        # NOTE(review): the relative "-cp .mvn/wrapper" only resolves when the
        # script is started from the project base directory - confirm.
        ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "${MAVEN_BASEDIR:-$BASE_DIR}")
      fi
    fi
  fi
fi
##########################################################################################
# End of extension
##########################################################################################
# Resolve the project base directory: an explicit MAVEN_BASEDIR wins,
# otherwise fall back to BASE_DIR (set earlier in this script, outside this excerpt).
export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"}
if [ "$MVNW_VERBOSE" = true ]; then
echo $MAVEN_PROJECTBASEDIR
fi
# Prepend the contents of .mvn/jvm.config to any MAVEN_OPTS already set.
# concat_lines is a helper defined earlier in this script (not visible here).
MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
# For Cygwin, switch paths to Windows format before running java
if $cygwin; then
[ -n "$M2_HOME" ] &&
M2_HOME=`cygpath --path --windows "$M2_HOME"`
[ -n "$JAVA_HOME" ] &&
JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
[ -n "$CLASSPATH" ] &&
CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
[ -n "$MAVEN_PROJECTBASEDIR" ] &&
MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"`
fi
# Provide a "standardized" way to retrieve the CLI args that will
# work with both Windows and non-Windows executions.
MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@"
export MAVEN_CMD_LINE_ARGS
WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
# Replace this shell with the JVM running the wrapper main class.
# MAVEN_OPTS, MAVEN_DEBUG_OPTS and MAVEN_CONFIG are left unquoted on purpose so
# that each whitespace-separated option becomes its own argument.
exec "$JAVACMD" \
$MAVEN_OPTS \
$MAVEN_DEBUG_OPTS \
-classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
"-Dmaven.home=${M2_HOME}" \
"-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@"
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements. See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership. The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License. You may obtain a copy of the License at
@REM
@REM https://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied. See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------
@REM ----------------------------------------------------------------------------
@REM Maven Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM set title of command window
title %0
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%
@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")
@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %*
if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %*
:skipRcPre
@setlocal
set ERROR_CODE=0
@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal
@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome
echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init
echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error
@REM ==== END VALIDATION ====
:init
@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.
set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir
:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir
:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"
:endDetectBaseDir
IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%
:endReadAdditionalConfig
SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO (
IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B
)
@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central
@REM This allows using the maven wrapper in projects that prohibit checking in binary data.
@REM Apply the MVNW_REPOURL override BEFORE the parenthesised block below.
@REM cmd substitutes percent-variables once, when a whole block is parsed, so a
@REM SET issued inside the block is not visible to the download command that
@REM follows it in the same block. The override used to be silently ignored.
if not "%MVNW_REPOURL%" == "" (
    SET DOWNLOAD_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar"
)
if exist %WRAPPER_JAR% (
    if "%MVNW_VERBOSE%" == "true" (
        echo Found %WRAPPER_JAR%
    )
) else (
    if "%MVNW_VERBOSE%" == "true" (
        echo Couldn't find %WRAPPER_JAR%, downloading it ...
        echo Downloading from: %DOWNLOAD_URL%
    )
    powershell -Command "&{"^
        "$webclient = new-object System.Net.WebClient;"^
        "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^
        "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^
        "}"^
        "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^
        "}"
    if "%MVNW_VERBOSE%" == "true" (
        echo Finished downloading %WRAPPER_JAR%
    )
)
@REM End of extension
@REM Provide a "standardized" way to retrieve the CLI args that will
@REM work with both Windows and non-Windows executions.
set MAVEN_CMD_LINE_ARGS=%*
%MAVEN_JAVA_EXE% ^
%JVM_CONFIG_MAVEN_PROPS% ^
%MAVEN_OPTS% ^
%MAVEN_DEBUG_OPTS% ^
-classpath %WRAPPER_JAR% ^
"-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^
%WRAPPER_LAUNCHER% %MAVEN_CONFIG% %*
if ERRORLEVEL 1 goto error
goto end
:error
set ERROR_CODE=1
:end
@endlocal & set ERROR_CODE=%ERROR_CODE%
if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat"
if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd"
:skipRcPost
@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%"=="on" pause
if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE%
cmd /C exit /B %ERROR_CODE%
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.9.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<groupId>com.zzsn</groupId>
<artifactId>weixinCrawler</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>weixinCrawler</name>
<description>weixinCrawler</description>
<properties>
<failOnMissingWebXml>false</failOnMissingWebXml>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- Jars shipped in the project's lib/ directory.
     ${project.basedir} replaces the deprecated ${pom.basedir} expression,
     which Maven 3 reports with a model warning. -->
<dependency>
    <groupId>jackson-all</groupId>
    <artifactId>jackson-all</artifactId>
    <version>1.7.6</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/lib/jackson-all-1.7.6.jar</systemPath>
</dependency>
<dependency>
    <groupId>jedis</groupId>
    <artifactId>jedis</artifactId>
    <version>3.0.1</version>
    <scope>system</scope>
    <systemPath>${project.basedir}/lib/jedis-3.0.1.jar</systemPath>
</dependency>
<!-- xml解析 -->
<dependency>
<groupId>jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
</dependency>
<!---->
<dependency>
<groupId>io.protostuff</groupId>
<artifactId>protostuff-core</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>io.protostuff</groupId>
<artifactId>protostuff-runtime</artifactId>
<version>1.6.0</version>
</dependency>
<!-- http 工具 -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.2.0</version>
</dependency>
<dependency>
<groupId>com.burgstaller</groupId>
<artifactId>okhttp-digest</artifactId>
<version>1.15</version>
</dependency>
<!-- mybatis-plus -->
<!--<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>3.4.1</version>
</dependency>-->
<!-- 数据库连接池 -->
<!--<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>-->
<!--redis依赖-->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.13</version>
</dependency>
<!-- kafka依赖添加 -->
<dependency>
<groupId>org.springframework.kafka</groupId>
<artifactId>spring-kafka</artifactId>
<!-- <version>2.1.0.RELEASE</version>-->
</dependency>
<!--xpath解析-->
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.2</version>
</dependency>
<dependency>
<groupId>com.whalin</groupId>
<artifactId>Memcached-Java-Client</artifactId>
<version>3.0.2</version>
</dependency>
<dependency>
<groupId>net.spy</groupId>
<artifactId>spymemcached</artifactId>
<version>2.12.2</version>
</dependency>
<dependency>
<groupId>com.googlecode.xmemcached</groupId>
<artifactId>xmemcached</artifactId>
<version>2.4.7</version>
</dependency>
<dependency>
<groupId>cn.wanghaomiao</groupId>
<artifactId>JsoupXpath</artifactId>
<version>2.5.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
<!-- spring定时任务 -->
<dependency>
<groupId>org.quartz-scheduler</groupId>
<artifactId>quartz</artifactId>
<version>2.2.1</version>
</dependency>
<!-- 该依赖必加,里面有sping对schedule的支持 -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.3.8</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<scope>runtime</scope>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<!-- Version intentionally omitted: it is managed by spring-boot-starter-parent.
     The former RELEASE meta-version resolved to whatever the newest published
     starter was, mixing Spring Boot generations and making builds non-reproducible.
     The compile scope is Maven's default, so it is not spelled out. -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<mainClass>com.zzsn.WeixinCrawlerApplication</mainClass>
<includeSystemScope>true</includeSystemScope><!--外部进行打包-->
<excludes>
<exclude>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</exclude>
</excludes>
</configuration>
<executions>
<execution>
<goals>
<goal>repackage</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
package com.zzsn;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.builder.SpringApplicationBuilder;
import org.springframework.boot.web.servlet.support.SpringBootServletInitializer;
//@SpringBootApplication
/**
 * Application entry point.
 *
 * <p>Can be started directly through {@link #main(String[])} or deployed as a
 * WAR, in which case the servlet container calls
 * {@link #configure(SpringApplicationBuilder)}. Component scanning is rooted
 * at {@code com.zzsn}.
 */
@SpringBootApplication(scanBasePackages = "com.zzsn")
public class WeixinCrawlerApplication extends SpringBootServletInitializer {

    /**
     * Starts the application with the given command-line arguments.
     *
     * @param args arguments forwarded to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(WeixinCrawlerApplication.class, args);
    }

    /**
     * Registers this class as the configuration source for WAR deployments.
     *
     * @param builder builder supplied by the servlet initializer
     * @return the builder with this class added as a source
     */
    @Override
    protected SpringApplicationBuilder configure(SpringApplicationBuilder builder) {
        final SpringApplicationBuilder withSources = builder.sources(WeixinCrawlerApplication.class);
        return withSources;
    }
}
package com.zzsn.awx.controller;
import cn.hutool.core.date.DateTime;
import com.alibaba.fastjson.JSON;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.crawler.WeixinDetailThread;
import com.zzsn.entity.SiteMsgRecord;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.entity.Wxurl;
import com.zzsn.job.JedisUtil;
import com.zzsn.awx.service.ApiService;
import com.zzsn.awx.service.SiteService;
import com.zzsn.util.Constants;
import com.zzsn.util.WeixinUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 后台跳转控制类
* 创建人:李东亮
* 创建时间:2015-5-7 下午6:52:37
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Slf4j
@Controller
@RequestMapping("/wxt")
public class WeixinController {
// http://localhost:8079/wxt/dofiddlerback?wxurl=1
/**
 * Liveness probe: answers GET {@code /wxt/test} with a fixed greeting.
 *
 * @return the literal text {@code hello!}
 */
@RequestMapping(value = "/test", method = RequestMethod.GET)
@ResponseBody
public String test() {
    final String greeting = "hello!";
    return greeting;
}
/**
 * Callback for captured WeChat article-list payloads.
 *
 * <p>Every article URL matching {@code http:.{200,300}#wechat_redirect} is
 * extracted from {@code weixinxml}. For each one, the account record cached in
 * Redis under {@code ":" + <id from WeixinDetailThread.getParam>} is loaded
 * and the article is crawled through {@link WeixinDetailThread#detailCrawler()}.
 * If at least one article was crawled successfully, a {@link SiteMsgRecord}
 * summary is sent to the {@code Constants.KAFKA_COLLECT_TOPIC} topic.
 *
 * @param wxurl     URL reported by the caller (not used by this handler)
 * @param weixinxml raw payload to scan for article links; may be null or empty
 * @return always {@code "ok"}
 * @throws Exception if the crawl or the Kafka send fails
 */
@RequestMapping("dofiddlerback")
public @ResponseBody String doFiddlerback(String wxurl, String weixinxml) throws Exception {
    // A missing payload used to fail with a NullPointerException in Pattern.matcher.
    if (StringUtils.isEmpty(weixinxml)) {
        log.warn("dofiddlerback called without weixinxml, wxurl={}", wxurl);
        return "ok";
    }
    KafkaTemplate kafkaTemplate = SpringContextUtil.getBean(KafkaTemplate.class);
    String patt = "http:.{200,300}#wechat_redirect";
    Pattern p = Pattern.compile(patt);
    Matcher m = p.matcher(weixinxml);
    int count = 0;
    Date collectTime = null;
    String infoSourceId = "";
    while (m.find()) {
        // Strip JSON escaping backslashes and the leftover "amp;" of HTML-escaped ampersands.
        String weixinurl = m.group(0).replaceAll("\\\\", "").replaceAll("amp;", "");
        WeixinDetailThread weixinDetailThread = new WeixinDetailThread();
        String weixinid = weixinDetailThread.getParam(weixinurl);
        String siteStr = JedisUtil.getString(":" + weixinid);
        SiteMsgTemple siteMsgTemple =
                StringUtils.isEmpty(siteStr) ? null : JSON.parseObject(siteStr, SiteMsgTemple.class);
        if (siteMsgTemple == null) {
            // Without the cached account record there is nothing to attach the
            // article to; skip it instead of failing the whole request.
            log.warn("no cached account record for id {}, skipping url: {}", weixinid, weixinurl);
            continue;
        }
        siteMsgTemple.setSiteUri(weixinurl);
        weixinDetailThread.siteMsgTemple = siteMsgTemple;
        collectTime = DateTime.now();
        infoSourceId = siteMsgTemple.getId();
        // Download the article behind the link and publish it to Kafka.
        boolean flag = weixinDetailThread.detailCrawler();
        if (flag) {
            count++;
        }
        log.info("提取公众号详情信息url:" + weixinurl);
    }
    if (count > 0) {
        ObjectMapper mapper = new ObjectMapper();
        try {
            SiteMsgRecord siteMsgRecord = new SiteMsgRecord();
            siteMsgRecord.setInfoSourceId(infoSourceId);
            siteMsgRecord.setNum(count);
            siteMsgRecord.setSource("1");
            siteMsgRecord.setCollectTime(collectTime);
            String docjson = mapper.writeValueAsString(siteMsgRecord);
            log.info(docjson);
            kafkaTemplate.send(Constants.KAFKA_COLLECT_TOPIC, "key", docjson);
            log.info("发送到kafka成功。");
        } catch (JsonProcessingException e) {
            // Keep the cause: the original logged only a fixed message at info level.
            log.error("发送到kafka失败。", e);
        }
    }
    return "ok";
}
/**
 * Callback that records the converted URL for a previously submitted one.
 *
 * <p>The {@code signature} query parameter of {@code oldurl} is used as a
 * Redis key. When both {@code <signature>} and {@code "apiflag" + <signature>}
 * hold a value and {@code wxurl} is present, {@code wxurl} is cached under the
 * key {@code oldurl}.
 *
 * @param wxurl  converted URL to remember
 * @param oldurl original URL carrying the {@code signature} parameter
 * @return always {@code "123"}
 * @throws Exception on Redis failures or when the cached id is not numeric
 */
@RequestMapping("dofiddlerbackurl")
public @ResponseBody String doFiddlerbackforurl(String wxurl, String oldurl) throws Exception {
    System.out.println(wxurl);
    System.out.println(oldurl);
    String keyid = SiteService.getParambyname(oldurl, "signature");
    if (StringUtils.isEmpty(keyid)) {
        // No signature means no usable Redis key; the original went on to query
        // Redis with a null key and with the literal key "apiflagnull".
        log.warn("dofiddlerbackurl: no signature parameter in oldurl={}", oldurl);
        return "123";
    }
    String wxid = JedisUtil.getString(keyid);
    String wxidflag = JedisUtil.getString("apiflag" + keyid);
    if (StringUtils.isNotEmpty(wxid) && StringUtils.isNotEmpty(wxidflag) && StringUtils.isNotEmpty(wxurl)) {
        // The original built a Wxurl entity here that was never used; only its
        // numeric validation of the cached id had an effect, so that is kept.
        Long.parseLong(wxid);
        JedisUtil.setString(oldurl, wxurl, 0);
    }
    return "123";
}
/**
 * Variant of the URL callback that only looks up the cached id.
 *
 * <p>Prints both arguments, reads the Redis value stored under the
 * {@code signature} query parameter of {@code oldurl} and discards it; the
 * update that used to consume it is disabled.
 *
 * @param wxurl  converted URL (only printed)
 * @param oldurl original URL carrying the {@code signature} parameter
 * @return always {@code "123"}
 * @throws Exception on Redis failures
 */
@RequestMapping("dofiddlerbackurl111")
public @ResponseBody String doFiddlerbackforurl111(String wxurl, String oldurl) throws Exception {
    System.out.println(wxurl);
    System.out.println(oldurl);
    final String signature = SiteService.getParambyname(oldurl, "signature");
    // The looked-up id is currently unused: siteService.updateBasedata(wxid, wxurl) is disabled.
    JedisUtil.getString(signature);
    return "123";
}
/**
 * Manual smoke test for the article-link regular expression.
 *
 * <p>Runs the pattern used by doFiddlerback over an embedded sample payload,
 * prints each raw match, its cleaned form and its length, and finally calls
 * WeixinUtil.sendWxMessage with literal arguments.
 *
 * <p>NOTE(review): running this method sends a real message, and the last two
 * literal arguments of sendWxMessage look like account credentials. Confirm,
 * then move them out of source control.
 */
public static void main(String[] args) {
String patt = "http:.{200,300}#wechat_redirect";
// Sample payload captured from an article list response (HTML-escaped JSON).
String html="etime&quot;:1562742531,&quot;fakeid&quot;:&quot;3008169006&quot;,&quot;status&quot;:2,&quot;content&quot;:&quot;&quot;},&quot;app_msg_ext_info&quot;:{&quot;title&quot;:&quot;孩子,爸爸没有双手,依然可以抱你长大!&quot;,&quot;digest&quot;:&quot;一场意外事故,让他在13岁时失去了双臂。如今他照顾着九个月大的儿子。&quot;,&quot;content&quot;:&quot;&quot;,&quot;fileid&quot;:504808950,&quot;content_url&quot;:&quot;http:\\/\\/mp.weixin.qq.com\\/s?__biz=MzAwODE2OTAwNg==&amp;amp;mid=2652292603&amp;amp;idx=1&amp;amp;sn=7f49e032fab2b1df15ca01a4add8e968&amp;amp;chksm=809096bab7e71fac1bec47b17d37314c0948dd1ed61aea09abaa823f498f17e8511ce5bb08f2&amp;amp;scene=27#wechat_redirect&quot;,&quot;source_url&quot;:&quot;https:\\/\\/sina.cn\\/&quot;,&quot;cover&quot;:&quot;http:\\/\\/mmbiz.qpic.cn\\/mmbiz_jpg\\/x6iaHWKibUzk0J1dOiccqkceSyM6n6SngTicJyaUo7N7zGAtz7pzJOZ8PibUyibgEvQKIWzV5I0yLAiaeHPrLTEU9nW6A\\/0?wx_fmt=jpeg&quot;,&quot;subtype&quot;:9,&quot;is_multi&quot;:1,&quot;multi_app_msg_item_list&quot;:[{&quot;title&quot;:&quot;捉谣记&nbsp;|&nbsp;官方公布酒驾玛莎拉蒂女车主有间歇性精神病?消息不靠谱&quot;,&quot;digest&quot;:&quot;目前并没有任何在调查和处置该案的“官方”公布酒驾女车主有间歇性精神病。&quot;,&quot;content&quot;:&quot;&quot;,&quot;fileid&quot;:504808954,&quot;content_url&quot;:&quot;http:\\/\\/mp.weixin.qq.com\\/s?__biz=MzAwODE2OTAwNg==&amp;amp;mid=2652292603&amp;amp;idx=2&amp;amp;sn=ef7a35347bc9dceb79c0fa42f8840f79&amp;amp;chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&amp;amp;scene=27#wechat_redirect&quot;,&quot;source_url&quot;:&quot;https:\\/\\/sina.cn\\/&quot;,&quot;cover&quot;:&quot;http:\\/\\/mmbiz.qpic.cn\\/mmbiz_jpg\\/x6iaHWKibUzk0J1dOiccqkceSyM6n6SngTicPeCnYPoIzGSvXISGLQFdRIkV0ZrOODuczz0bxCFXXqfTIv6IOqVPtg\\/0?";
Pattern p=Pattern.compile(patt);
Matcher m=p.matcher(html);
while (m.find()) {
System.out.println(1);
// Raw match as it appears in the payload.
System.out.println(m.group(0));
// Same clean-up as the request handlers: drop backslashes and "amp;".
System.out.println(m.group(0).replaceAll("\\\\","").replaceAll("amp;",""));
System.out.println(m.group(0).length());
}
WeixinUtil.sendWxMessage("lwg2468741258", "点击链接:"+"http://mp.weixin.qq.com/s?__biz=MzAwODE2OTAwNg==&mid=2652292603&idx=2&sn=ef7a35347bc9dceb79c0fa42f8840f79&chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&scene=27#wechat_redirect", 1000002, "ww6bef1e81aacbf27a",
"ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
}
/**
 * Callback for captured app payloads: prints every
 * {@code http://m.toutiao.com...} link found in {@code weixinxml}.
 *
 * <p>The duplicate check and the message-queue hand-off that used to follow
 * are disabled, so each match is only reported on standard output.
 *
 * @param wxurl     URL reported by the caller (only printed)
 * @param weixinxml raw payload to scan; may be null
 * @return always {@code "123"}
 * @throws Exception declared for interface compatibility
 */
@RequestMapping("dofiddlerbackapp")
public @ResponseBody String doFiddlerbackapp(String wxurl, String weixinxml) throws Exception {
    System.out.println(wxurl);
    System.out.println(weixinxml);
    // A missing payload used to fail with a NullPointerException in Pattern.matcher.
    if (weixinxml == null) {
        return "123";
    }
    String patt = "http://m.toutiao.com.{10,30}\"";
    Matcher m = Pattern.compile(patt).matcher(weixinxml);
    while (m.find()) {
        String weixinurl = m.group(0).replaceAll("\\\\", "").replaceAll("amp;", "");
        // TODO: restore the "already crawled" lookup and the queue hand-off.
        // The lookup result was hard-wired to null, so the "yipaqu" branch
        // could never run, and the SiteMsgTemple built here was never sent.
        System.out.println("录入mq" + weixinurl);
    }
    return "123";
}
}
package com.zzsn.awx.service;
import com.zzsn.entity.Wxurl;
import com.zzsn.job.JedisUtil;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
/**
* 爬虫service
* 创建人:李东亮
* 创建时间:2016-4-13 下午2:52:20
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Service
public class ApiService {
@Autowired
private SiteService siteService;
public Boolean suo_flag=true;// 单个请求锁,只允许一个人访问
/**
 * Waits for the converted form of {@code oldurl}, allowing one caller at a time.
 *
 * <p>{@code suo_flag} acts as a busy flag: a caller arriving while another
 * request is in progress gets {@code "0"} immediately. The flag is now claimed
 * atomically and released in a {@code finally} block, so neither an early
 * return nor an exception can leave the service locked.
 *
 * <p>NOTE(review): the result list is created empty on every iteration (the
 * lookup and the save are disabled), so at present the loop never finds a
 * converted URL and the method returns {@code "0"}.
 *
 * @param oldurl URL whose converted form is requested
 * @return the converted URL, or {@code "0"} when busy or nothing was found
 */
public String getNurl(String oldurl) {
    // Check-and-claim must be atomic, otherwise two requests can both pass the check.
    synchronized (this) {
        if (!suo_flag) {
            return "0";
        }
        suo_flag = false;
    }
    try {
        Wxurl url = new Wxurl();
        url.setFlag(0L);
        url.setOurl(oldurl);
        // Poll up to 15 times, one second apart, for a finished conversion.
        for (int i = 0; i < 15; i++) {
            List<Wxurl> list1 = new ArrayList<>();
            if (list1.size() > 0) {
                Long flag = list1.get(0).getFlag();
                String nurl = list1.get(0).getNurl();
                if (flag.longValue() == 1 && StringUtils.isNotEmpty(nurl)) {
                    return nurl;
                }
                Thread.sleep(1000L);
                System.out.println(i);
            } else {
                // wxurlDao.save(url);
            }
        }
    } catch (InterruptedException e) {
        // Preserve the interrupt for the caller instead of swallowing it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        synchronized (this) {
            suo_flag = true;
        }
    }
    return "0";
}
public static String apiflag="apiflag";
/**
 * Starts a background thread that processes a URL-conversion entry every five seconds.
 *
 * <p>For each entry, the {@code signature} query parameter of its original URL
 * is used as a Redis key. The entry id is stored under {@code <signature>} and
 * {@code apiflag + <signature>} for 600 seconds, and the original URL is sent
 * through {@code siteService.sendurl}.
 *
 * <p>Fixes compared with the previous version:
 * <ul>
 *   <li>the flag comparison is null-safe (it used to unbox a possibly-null {@code Long});</li>
 *   <li>Redis keys are written only after the signature is known to be non-empty;</li>
 *   <li>the five-second pause also happens after a failure, so an error no
 *       longer turns the loop into a tight retry;</li>
 *   <li>interrupting the thread ends the loop.</li>
 * </ul>
 *
 * <p>NOTE(review): the entry is still a freshly created, empty {@link Wxurl}.
 * The real source of pending entries appears to have been removed. The
 * {@code wxurl} argument is not used.
 *
 * @param wxurl unused
 */
public void zhuanhuanurl(String wxurl) {
    Thread t = new Thread(new Runnable() {
        public void run() {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    Wxurl pending = new Wxurl();
                    List<Wxurl> list = new ArrayList<>();
                    list.add(pending);
                    if (list.size() > 0) {
                        Wxurl b = list.get(0);
                        // Entries already converted (flag == 1) get an extra wait.
                        Long flag = b.getFlag();
                        if (flag != null && flag.longValue() == 1L) {
                            System.out.println("meiyouxin_url");
                            Thread.sleep(5 * 1000);
                        }
                        String souceid = SiteService.getParambyname(b.getOurl(), "signature");
                        // Only a URL with a signature is usable as a Redis key.
                        if (StringUtils.isNotEmpty(souceid)) {
                            JedisUtil.setString(souceid, b.getId() + "", 600);
                            JedisUtil.setString(apiflag + souceid, b.getId() + "", 600);
                            siteService.sendurl(b.getOurl());
                        } else {
                            System.out.println("wentiurl" + b.getOurl());
                        }
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                } catch (Exception e) {
                    e.printStackTrace();
                }
                // Pause between rounds regardless of success or failure.
                try {
                    Thread.sleep(5 * 1000);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
    });
    t.start();
}
}
\ No newline at end of file
package com.zzsn.awx.service;
import com.alibaba.fastjson.JSON;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.DateUtil;
import com.zzsn.util.WeixinUtil;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.util.*;
/**
* 爬虫service
* 创建人:李东亮
* 创建时间:2016-4-13 下午2:52:20
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
@Service
public class SiteService {
private static final Logger Log = LoggerFactory.getLogger(SiteService.class);
private static Long id=0L;
/**
 * Caches an account record in Redis and pushes its URL to WeChat.
 *
 * <p>Steps: derive the account id from the {@code __biz} parameter of the site
 * URL; store the record as JSON under {@code ":" + id} without expiry; delete
 * the Redis key equal to the bare id; send the URL through
 * {@link #sendurl(String)}; then wait 30 seconds.
 *
 * @param siteMsgTemple account record whose site URL is sent
 */
public void sendUrlToweixin(SiteMsgTemple siteMsgTemple) {
    final String siteUrl = siteMsgTemple.getSiteUri();
    String accountId = "";
    try {
        accountId = getParam(siteUrl);
        // Cache the record so the callback can look it up later.
        JedisUtil.setString(":" + accountId, JSON.toJSONString(siteMsgTemple), 0);
    } catch (Exception cacheFailure) {
        cacheFailure.printStackTrace();
    }

    boolean hasAccountId = accountId != null && accountId.trim().length() > 0;
    if (hasAccountId) {
        clearweixinhaoid(accountId);
    }

    sendurl(siteUrl);

    try {
        Thread.sleep(1000 * 30);
    } catch (Exception sleepFailure) {
        sleepFailure.printStackTrace();
    }
}
// NOTE(security): these values are passed verbatim to WeixinUtil.sendWxMessage.
// They are presumably the agent id, corp id and corp secret of a WeCom
// application (confirm against WeixinUtil) and should be moved to external
// configuration rather than kept in source control.
private static final int WX_SEND_AGENT = 1000002;
private static final String WX_SEND_CORP = "ww6bef1e81aacbf27a";
private static final String WX_SEND_SECRET = "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0";

/** Back-off applied after a failed send, in milliseconds. */
private static final long WX_FAILURE_PAUSE_MILLIS = 20 * 1000L;

/**
 * Sends {@code url} to the default recipient ({@code Constants.WXSENDNAME})
 * and then waits 30 seconds; after a failed send it waits 20 seconds.
 *
 * @param url link to deliver
 */
public synchronized void sendurl(String url) {
    sendAndPause(Constants.WXSENDNAME, url, 30 * 1000L);
}

/**
 * Sends {@code url} to {@code wxname} and then waits 20 seconds, whether or
 * not the send succeeded.
 *
 * @param url    link to deliver
 * @param wxname recipient account name
 */
public synchronized void sendurl1(String url, String wxname) {
    sendAndPause(wxname, url, 20 * 1000L);
}

/**
 * Sends the "click the link" message and throttles the caller afterwards.
 *
 * @param receiver           recipient account name
 * @param url                link appended to the message text
 * @param successPauseMillis pause after a successful send
 */
private void sendAndPause(String receiver, String url, long successPauseMillis) {
    try {
        WeixinUtil.sendWxMessage(receiver, "点击链接:" + url, WX_SEND_AGENT, WX_SEND_CORP, WX_SEND_SECRET);
        Thread.sleep(successPauseMillis);
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of sleeping again.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        try {
            Thread.sleep(WX_FAILURE_PAUSE_MILLIS);
        } catch (InterruptedException e1) {
            Thread.currentThread().interrupt();
        }
        e.printStackTrace();
    }
}
/**
 * Deletes the Redis entry stored under {@code weixinhaoid}.
 *
 * <p>The deletion is best effort: a failure is logged and not propagated.
 * Previously it was swallowed without any trace.
 *
 * @param weixinhaoid Redis key to delete
 */
public static void clearweixinhaoid(String weixinhaoid) {
    try {
        JedisUtil.del(weixinhaoid);
    } catch (Exception e) {
        Log.warn("failed to delete redis key {}", weixinhaoid, e);
    }
}
/**
 * Splits the query string of {@code url} into a name/value map.
 *
 * <p>Everything after the first {@code ?} (up to a second {@code ?}) is split
 * on {@code &}, and each pair on {@code =}. Only the first two {@code =}
 * separated fields of a pair are used and trailing empty fields are dropped,
 * so a value such as {@code MzAw==} is stored as {@code MzAw}.
 * NOTE(review): this truncation is kept on purpose, because Redis keys
 * elsewhere in this file are derived from the value returned here.
 *
 * <p>A parameter without a value (for example {@code flag} or {@code a=}) maps
 * to the empty string; it used to raise
 * {@code ArrayIndexOutOfBoundsException}, as did the input {@code "?"}. Values
 * are not URL-decoded and a {@code #fragment} is not removed.
 *
 * @param url URL to inspect; may be null or blank
 * @return a mutable map of parameters, empty when there is no query string
 */
public static Map<String, String> parse(String url) {
    Map<String, String> map = new HashMap<String, String>();
    if (url == null) {
        return map;
    }
    url = url.trim();
    if (url.equals("")) {
        return map;
    }
    String[] urlParts = url.split("\\?");
    // No query string, or a bare "?" which splits into zero parts.
    if (urlParts.length < 2) {
        return map;
    }
    for (String param : urlParts[1].split("&")) {
        if (param.isEmpty()) {
            // Produced by "&&"; there is no name to store.
            continue;
        }
        String[] keyValue = param.split("=");
        if (keyValue.length == 0) {
            // The pair consisted only of '=' characters.
            continue;
        }
        String value = keyValue.length > 1 ? keyValue[1] : "";
        map.put(keyValue[0], value);
    }
    return map;
}
/**
 * Returns the {@code __biz} query parameter of {@code url}.
 *
 * <p>If parsing fails, the stack trace and the offending URL are printed and
 * the lookup runs against an empty map, which yields {@code null}.
 *
 * @param url URL to inspect
 * @return the {@code __biz} value, or {@code null} when absent or unparsable
 */
public static String getParam(String url) {
    Map<String, String> query;
    try {
        query = parse(url);
    } catch (Exception parseFailure) {
        parseFailure.printStackTrace();
        System.out.println(url);
        query = new HashMap<String, String>();
    }
    return query.get("__biz");
}
/**
 * Returns the query parameter {@code name} of {@code url}.
 *
 * @param url  URL to inspect
 * @param name parameter name to look up
 * @return the parameter value, or {@code null} when the URL has no such parameter
 */
public static String getParambyname(String url, String name) {
    return parse(url).get(name);
}
/**
 * Extracts the text between {@code __biz=} and {@code #wechat}.
 *
 * <p>Both markers are located with {@code indexOf}, whose "not found" result
 * is {@code -1}; a marker at index 0 is therefore a valid hit (the previous
 * {@code > 0} test missed it). When the end marker lies before the end of
 * {@code __biz=}, {@code null} is returned instead of raising
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s text to search; may be null
 * @return the enclosed text, or {@code null} when it cannot be located
 */
public static String getweixinId(String s) {
    if (s == null) {
        return null;
    }
    final String startMarker = "__biz=";
    int start = s.indexOf(startMarker);
    int end = s.indexOf("#wechat");
    if (start < 0 || end < start + startMarker.length()) {
        return null;
    }
    return s.substring(start + startMarker.length(), end);
}
/**
 * Extracts the text between {@code __biz=} and {@code &mid=}.
 *
 * <p>Both markers are located with {@code indexOf}, whose "not found" result
 * is {@code -1}; a marker at index 0 is therefore a valid hit (the previous
 * {@code > 0} test missed it). When the end marker lies before the end of
 * {@code __biz=}, {@code null} is returned instead of raising
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param s text to search; may be null
 * @return the enclosed text, or {@code null} when it cannot be located
 */
public static String getweixinId1(String s) {
    if (s == null) {
        return null;
    }
    final String startMarker = "__biz=";
    int start = s.indexOf(startMarker);
    int end = s.indexOf("&mid=");
    if (start < 0 || end < start + startMarker.length()) {
        return null;
    }
    return s.substring(start + startMarker.length(), end);
}
/**
 * Crawls one WeChat article and marks its URL as processed in Redis.
 *
 * <p>Flow: skip the URL if Redis already holds a value for it; extract the
 * article through {@link WeiXinDispatch}; remove every file/image tag reported
 * by {@link ContentFileFinder} from the content; build a {@link DocInfo}; then,
 * for every {@code orgId-tid-sid} entry registered for the account, fill in the
 * organisation fields and write the "processed" marker.
 *
 * <p>Fixes compared with the previous version:
 * <ul>
 *   <li>the entry iterator was hard-wired to {@code null}, so the method always
 *       ended in a {@code NullPointerException}; entries are now read from the
 *       Redis value stored under the account id;</li>
 *   <li>an empty tag key can no longer spin the removal loop forever;</li>
 *   <li>the split entry is printed with {@code Arrays.toString};</li>
 *   <li>a missing publish date no longer raises a {@code NullPointerException}.</li>
 * </ul>
 *
 * <p>NOTE(review): the Redis value is assumed to be a comma-separated list of
 * {@code orgId-tid-sid} entries. The disabled line only shows that it used to
 * be iterable, so confirm the stored format.
 *
 * @param siteMsgTemple record carrying the article URL
 * @throws Exception if extraction fails
 */
public void crawlerweixin(SiteMsgTemple siteMsgTemple) throws Exception {
    String weixinurl = siteMsgTemple.getSiteUri();
    // Skip URLs that were already handled.
    String urlflag = JedisUtil.getString(weixinurl);
    if (!StringUtils.isEmpty(urlflag)) {
        System.out.println("已爬取1" + weixinurl);
        return;
    }
    // Look up the organisation entries registered for this account.
    String weixinid = getParam(weixinurl);
    String organdtids = JedisUtil.getString(weixinid);
    WeiXinDispatch wx = new WeiXinDispatch();
    ExtEntity extEntity = wx.getExtractorElement(weixinurl);
    String contentNoTag = null;
    // NOTE(review): the second argument is a fixed article URL rather than
    // weixinurl. Confirm whether that is intended.
    Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(extEntity.getContentWithTag(), "https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
    System.out.println(extEntity.getContentWithTag());
    String formatImgContent = extEntity.getContentWithTag();
    for (String key : imgDataMap.keySet()) {
        if (key == null || key.isEmpty()) {
            // contains("") is always true, which would never let the loop end.
            continue;
        }
        // Repeat until removing the tag no longer re-forms another occurrence.
        while (formatImgContent.contains(key)) {
            formatImgContent = formatImgContent.replace(key, "");
        }
    }
    extEntity.setContentWithTag(formatImgContent);
    // NOTE(review): the plain text is derived from an empty string, not from
    // the article content. Kept as-is because the branch below depends on it.
    String contentWithTag = "";
    contentNoTag = ContentUtility.TransferHTML2Text(contentWithTag);
    DocInfo docInfo = new DocInfo();
    // NOTE(review): the source type is set twice; the second value wins.
    docInfo.setSourceType("WeChat");
    docInfo.setSourceaddress(weixinurl);
    docInfo.setLang("zh_CN");
    docInfo.setContentType("HTML");
    docInfo.setSourceType("News");
    docInfo.setCharset("utf-8");
    docInfo.setTitle(extEntity.getTitle());
    docInfo.setAuthor(extEntity.getAuthor());
    docInfo.setPublishDate(extEntity.getPublishDate());
    docInfo.setOrigin("微信公众号-" + extEntity.getAuthor());
    StringBuffer sb = new StringBuffer();
    sb.append("<html><head>");
    sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
    sb.append("<title></title></head><body>");
    sb.append(extEntity.getContentWithTag());
    sb.append("</body></html>");
    docInfo.setContentWithTag(sb.toString());
    docInfo.setContentNoTag(contentNoTag);
    docInfo.setContentImgCvtTag(sb.toString());
    List<String> orgEntries = StringUtils.isEmpty(organdtids)
            ? Collections.<String>emptyList()
            : Arrays.asList(organdtids.split(","));
    for (String str : orgEntries) {
        System.out.println(str);
        // Each entry is expected to be orgId-tid-sid.
        String[] ss = str.split("-");
        if (ss.length != 3) {
            return;
        } else {
            System.out.println(str);
            System.out.println(Arrays.toString(ss));
        }
        String orgid = ss[0];
        String tid = ss[1];
        String sid = ss[2];
        docInfo.setOrgId(Long.valueOf(orgid));
        docInfo.setSid(Long.valueOf(sid));
        Map<String, String> params = new HashMap<String, String>();
        params.put("fromWhere", "weixincraw");
        if (null != tid && !"null".equals(tid)) {
            params.put("tid", tid);
        }
        docInfo.setOtherParams(params);
        // Cut-off date: two days before now.
        String week = DateUtil.getDateBeforeDays(new Date(), 2);
        if (docInfo.getTitle() == null) {
            if (StringUtils.isEmpty(contentNoTag)) {
                // Nothing extracted: leave the URL unmarked so it is retried.
            } else {
                JedisUtil.setString(weixinurl, 1 + "", 0);
            }
        } else if (docInfo.getPublishDate() != null && docInfo.getPublishDate().compareTo(week) < 0) {
            // Older than the cut-off: mark as processed without pushing.
            System.out.println("时间过期" + docInfo.getPublishDate());
            JedisUtil.setString(weixinurl, 1 + "", 0);
        } else {
            JedisUtil.setString(weixinurl, 1 + "", 0);
        }
    }
}
/**
 * Manual check of the URL helpers: prints the {@code __biz} value of a sample
 * profile URL three ways, a date-string comparison, and the lookup of a
 * {@code signature} parameter that the sample article URL does not contain.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String profileUrl = "https://mp.weixin.qq.com/mp/profile_ext?action=home&scene=114&__biz=MzAwODE2OTAwNg==#wechat_redirect";
    System.out.println(getParam(profileUrl));

    // Same extraction done by hand with the two markers.
    int bizAt = profileUrl.indexOf("__biz=");
    int fragmentAt = profileUrl.indexOf("#wechat");
    String between = (bizAt > 0 && fragmentAt > 0) ? profileUrl.substring(bizAt + 6, fragmentAt) : null;
    System.out.println(between);

    between = getParam(profileUrl);
    System.out.println(between);

    String time = "2019-11-18 12:20:23";
    String week = DateUtil.getDateBeforeDays(new Date(), 1);
    System.out.println(time.compareTo(week));

    String articleUrl = "http://mp.weixin.qq.com/s?__biz=MzUxMzEzNjg1Ng==&mid=2247484003&idx=1&sn=965ca574850ab65be466c443bf8e2a3b&scene=0965ca574850ab65be466c443bf8e2a3b";
    System.out.println(getParambyname(articleUrl, "signature"));
}
}
\ No newline at end of file
package com.zzsn.common;
/**
 * Credential returned by the WeChat API: the token string and its lifetime.
 *
 * @author liufeng
 * @date 2013-08-08
 */
public class AccessToken {

    /** Lifetime of the token, in seconds. */
    private int expiresIn;

    /** The token value that was obtained. */
    private String token;

    /** @return the token value, or {@code null} if none has been set */
    public String getToken() {
        return this.token;
    }

    /** @param token the token value to store */
    public void setToken(String token) {
        this.token = token;
    }

    /** @return the token lifetime in seconds */
    public int getExpiresIn() {
        return this.expiresIn;
    }

    /** @param expiresIn the token lifetime in seconds */
    public void setExpiresIn(int expiresIn) {
        this.expiresIn = expiresIn;
    }
}
package com.zzsn.common;
import javax.net.ssl.X509TrustManager;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
* 证书信任管理器(用于https请求)
*
* @author liufeng
* @date 2013-08-08
*/
public class MyX509TrustManager implements X509TrustManager {
public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
}
public X509Certificate[] getAcceptedIssuers() {
return null;
}
}
\ No newline at end of file
package com.zzsn.common;
/**
 * Simple bean wrapping a single text {@code content} value.
 */
public class TextContent {

    /** The wrapped text; {@code null} until set. */
    private String content;

    /**
     * @return the current text, or {@code null} when none has been set
     */
    public String getContent() {
        return this.content;
    }

    /**
     * @param content the text to store
     */
    public void setContent(String content) {
        this.content = content;
    }
}
package com.zzsn.common.cache;
import java.util.HashMap;
import java.util.Map;
/**
 * Shared string constants plus one global registry map.
 * <p>
 * Every field of an interface is implicitly {@code public static final}, so the
 * modifiers are omitted here.
 *
 * @ClassName: IGatRedisKey
 * @author: renkai721
 * @date: 2018-06-25 16:54:42
 */
public interface IGatRedisKey {

    /**
     * Process-wide mutable map shared by every user of this interface.
     * It is a plain {@link HashMap}, so access is not synchronized.
     */
    Map<String, String> register_map = new HashMap<String, String>();

    /** Literal {@code "no"}. */
    String NO = "no";

    /** Literal {@code "ok"}. */
    String OK = "ok";

    /** URL scheme prefix {@code "http://"}. */
    String HTTP = "http://";

    /** Semicolon separator. */
    String F = ";";

    /**
     * Underscore separator.
     * NOTE(review): a lone underscore is not a legal identifier for Java 9+ compilers;
     * renaming it would break existing references, so it is kept unchanged here.
     */
    String _ = "_";
}
package com.zzsn.common.cache;
import com.google.code.yanf4j.core.impl.StandardSocketOption;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.command.BinaryCommandFactory;
import net.rubyeye.xmemcached.transcoders.SerializingTranscoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Memcached client holder.
 * <p>
 * A single shared XMemcached client is built from {@code memcached.properties}
 * when this class is initialized. A missing {@code memcached.server} property, a malformed
 * server entry or a failed client build aborts class initialization.
 */
public class Memcached {
    private static Logger logger = LoggerFactory.getLogger(Memcached.class);
    private static MemcachedClient client = null;

    static {
        logger.debug("memcached initialize...");
        Properties prop = getConfig();
        String server = prop.getProperty("memcached.server");
        if (!hasText(server)) {
            throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
        }

        // "host1:port1,host2:port2,..." with an optional per-server weight
        // under memcached.server<N>.weight (N is 1-based, default weight 1).
        String[] servers = server.split(",");
        int[] weights = new int[servers.length];
        List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
        for (int i = 0; i < servers.length; i++) {
            // Tolerate blanks around entries such as "a:11211, b:11211".
            String[] addr = servers[i].trim().split(":");
            if (addr.length != 2) {
                throw new IllegalArgumentException("Invalid memcached server entry '" + servers[i] + "', expected host:port");
            }
            addressList.add(new InetSocketAddress(addr[0].trim(), Integer.parseInt(addr[1].trim())));
            String weight = prop.getProperty("memcached.server" + (i + 1) + ".weight");
            weights[i] = hasText(weight) ? Integer.parseInt(weight) : 1;
        }

        XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList, weights);
        String poolSize = prop.getProperty("memcached.connectionPoolSize");
        if (hasText(poolSize)) {
            builder.setConnectionPoolSize(Integer.parseInt(poolSize));
        }
        String failureMode = prop.getProperty("memcached.failureMode");
        if (hasText(failureMode)) {
            builder.setFailureMode(Boolean.parseBoolean(failureMode));
        }
        String connTimeout = prop.getProperty("memcached.connectTimeout");
        if (hasText(connTimeout)) {
            builder.setConnectTimeout(Integer.parseInt(connTimeout));
        }
        String opTimeout = prop.getProperty("memcached.opTimeout");
        if (hasText(opTimeout)) {
            builder.setOpTimeout(Integer.parseInt(opTimeout));
        }
        String enableHealSession = prop.getProperty("memcached.enableHealSession");
        if (hasText(enableHealSession)) {
            // Enable or disable automatic connection healing.
            builder.setEnableHealSession(Boolean.parseBoolean(enableHealSession));
        }
        String statistics = prop.getProperty("memcached.statistics");
        if (hasText(statistics)) {
            builder.getConfiguration().setStatisticsServer(Boolean.parseBoolean(statistics));
        }
        // Only the exact string "true" switches to the binary protocol.
        if ("true".equals(prop.getProperty("memcached.binaryCommand"))) {
            builder.setCommandFactory(new BinaryCommandFactory());
        }
        builder.setTranscoder(new SerializingTranscoder());
        builder.setSocketOption(StandardSocketOption.SO_RCVBUF, 32 * 1024); // 32K receive buffer
        builder.setSocketOption(StandardSocketOption.SO_SNDBUF, 16 * 1024); // 16K send buffer
        // TCP_NODELAY=true turns Nagle's algorithm OFF (small writes are sent immediately).
        builder.setSocketOption(StandardSocketOption.TCP_NODELAY, true);

        // The key used to be read only under the misspelled name "memcahced.sessionIdleTimeout";
        // the correct spelling now wins and the legacy spelling stays as a fallback so
        // existing property files keep working.
        String sessionIdleTimeout = prop.getProperty("memcached.sessionIdleTimeout");
        if (!hasText(sessionIdleTimeout)) {
            sessionIdleTimeout = prop.getProperty("memcahced.sessionIdleTimeout");
        }
        if (hasText(sessionIdleTimeout)) {
            // Configured in seconds, passed on in milliseconds: after that long without any IO
            // the connection is considered idle and a heartbeat check is started.
            builder.getConfiguration().setSessionIdleTimeout(Integer.parseInt(sessionIdleTimeout) * 1000);
        }

        try {
            client = builder.build();
            String optimizeMergeBuffer = prop.getProperty("memcached.optimizeMergeBuffer");
            if (hasText(optimizeMergeBuffer)) {
                client.setOptimizeMergeBuffer(Boolean.parseBoolean(optimizeMergeBuffer));
            }
            String mergeFactor = prop.getProperty("memcached.mergeFactor");
            // Guard on mergeFactor itself; the previous check looked at optimizeMergeBuffer,
            // which could be null (NPE) or let an empty mergeFactor reach parseInt.
            if (hasText(mergeFactor)) {
                client.setMergeFactor(Integer.parseInt(mergeFactor));
            }
            logger.debug("memcached initialize completed!");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private Memcached() {
    }

    /** Returns {@code true} when the property value is neither {@code null} nor empty. */
    private static boolean hasText(String value) {
        return value != null && !value.isEmpty();
    }

    /**
     * @return the shared client built during class initialization
     */
    public static MemcachedClient getClient() {
        return client;
    }

    /**
     * Shuts the given client down; a {@code null} client is ignored and an
     * {@link IOException} during shutdown is logged rather than propagated.
     *
     * @param client the client to shut down, may be {@code null}
     */
    public static void shutdown(MemcachedClient client) {
        if (client == null) {
            return;
        }
        try {
            client.shutdown();
        } catch (IOException e) {
            logger.error("Failed to shut down memcached client", e);
        }
    }

    /**
     * Loads {@code memcached.properties} through Spring's {@link DefaultResourceLoader}.
     *
     * @return the loaded properties; empty when the file could not be read (the failure is logged)
     */
    public static Properties getConfig() {
        Properties properties = new Properties();
        String location = "memcached.properties";
        Resource resource = new DefaultResourceLoader().getResource(location);
        try (InputStream is = resource.getInputStream()) {
            properties.load(is);
            logger.debug("memcached config: {}", properties.toString());
        } catch (IOException ex) {
            logger.error("Could not load property file:" + location, ex);
        }
        return properties;
    }
}
package com.zzsn.common.cache;
import net.rubyeye.xmemcached.KeyIterator;
import net.rubyeye.xmemcached.MemcachedClient;
import net.rubyeye.xmemcached.XMemcachedClientBuilder;
import net.rubyeye.xmemcached.exception.MemcachedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Administrative helpers for memcached: prefix-based key listing/deletion through a
 * lazily created client of its own, and dynamic server registration on the shared
 * {@link Memcached} client.
 *
 * @Author Sugar
 * @Version 2018/6/8 13:22
 */
public class MemcachedAdmin {
    private static Logger logger = LoggerFactory.getLogger(MemcachedAdmin.class);
    private static MemcachedClient client = null;

    private MemcachedAdmin() {
    }

    /**
     * Returns the admin client, building it on first use from the
     * {@code memcached.server} property (comma-separated {@code host:port} entries).
     *
     * @return the admin client, never {@code null}
     * @throws IllegalArgumentException if {@code memcached.server} is missing or empty
     * @throws IllegalStateException    if the client cannot be built; previously the
     *                                  failure was swallowed and {@code null} was returned,
     *                                  which surfaced later as a NullPointerException
     */
    private static MemcachedClient getClient() {
        if (client != null) {
            return client;
        }
        Properties prop = Memcached.getConfig();
        String server = prop.getProperty("memcached.server");
        if (server == null || server.isEmpty()) {
            throw new IllegalArgumentException("The property 'memcached.server' is not found in memcached.properties file!");
        }
        String[] servers = server.split(",");
        List<InetSocketAddress> addressList = new ArrayList<>(servers.length);
        for (String entry : servers) {
            String[] addr = entry.split(":");
            addressList.add(new InetSocketAddress(addr[0], Integer.parseInt(addr[1])));
        }
        XMemcachedClientBuilder builder = new XMemcachedClientBuilder(addressList);
        try {
            client = builder.build();
        } catch (IOException e) {
            throw new IllegalStateException("Failed to build memcached admin client for " + server, e);
        }
        return client;
    }

    /**
     * Deletes every key that starts with the given prefix on all available servers.
     * Errors on one server are logged and do not stop processing of the other servers.
     * NOTE(review): presumably deprecated because it walks the whole key space through
     * the key iterator - confirm before relying on it.
     *
     * @param keyPrefix prefix the keys must start with
     * @return the number of matching keys for which a delete was issued
     */
    @Deprecated
    public static long deleteAll(String keyPrefix) {
        AtomicLong count = new AtomicLong();
        MemcachedClient admin = getClient();
        admin.getAvailableServers().forEach(inet -> {
            try {
                KeyIterator iterator = admin.getKeyIterator(inet);
                while (iterator.hasNext()) {
                    String key = iterator.next();
                    if (key.startsWith(keyPrefix)) {
                        boolean result = admin.delete(key);
                        long i = count.incrementAndGet();
                        if (logger.isDebugEnabled()) {
                            logger.debug("[{}] Delete key[{}]: {}={}", inet, i, key, result);
                        }
                    }
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller instead of silently dropping it.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while deleting keys on " + inet, e);
            } catch (MemcachedException | TimeoutException e) {
                logger.error("Failed to delete keys on " + inet, e);
            }
        });
        logger.info("Delete a total of {} keys starting with {}", count.get(), keyPrefix);
        return count.get();
    }

    /**
     * Collects every key that starts with the given prefix from all available servers.
     * Errors on one server are logged and do not stop processing of the other servers.
     *
     * @param keyPrefix prefix the keys must start with
     * @return the matching keys (possibly empty)
     */
    @Deprecated
    public static List<String> getAllKey(String keyPrefix) {
        MemcachedClient admin = getClient();
        List<String> keys = new ArrayList<>();
        admin.getAvailableServers().forEach(inet -> {
            try {
                KeyIterator iterator = admin.getKeyIterator(inet);
                while (iterator.hasNext()) {
                    String key = iterator.next();
                    if (key.startsWith(keyPrefix)) {
                        keys.add(key);
                    }
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller instead of silently dropping it.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while listing keys on " + inet, e);
            } catch (MemcachedException | TimeoutException e) {
                logger.error("Failed to list keys on " + inet, e);
            }
        });
        return keys;
    }

    /**
     * Dynamically adds a server with weight 1 to the shared {@link Memcached} client.
     *
     * @param host server host
     * @param port server port
     * @return {@code true} on success, {@code false} if an {@link IOException} occurred
     */
    public static boolean addServer(String host, int port) {
        return addServer(host, port, 1);
    }

    /**
     * Dynamically adds a server to the shared {@link Memcached} client.
     *
     * @param host   server host
     * @param port   server port
     * @param weight server weight
     * @return {@code true} on success, {@code false} if an {@link IOException} occurred (logged)
     */
    public static boolean addServer(String host, int port, int weight) {
        try {
            Memcached.getClient().addServer(host, port, weight);
            return true;
        } catch (IOException e) {
            logger.error("Failed to add memcached server " + host + ":" + port, e);
            return false;
        }
    }
}
package com.zzsn.common.cache;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.serializer.SerializerFeature;
import com.whalin.MemCached.MemCachedClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * Static convenience wrappers around the shared {@link Memcached} client.
 * <p>
 * The {@code put*} methods write without waiting for a reply; the {@code set*} methods
 * wait for the server and report success. Unless stated otherwise, failures are logged
 * and turned into a neutral return value instead of being thrown.
 *
 * @Author Sugar
 * @Version 2018/3/15 16:44
 */
public class MemcachedUtils {
    /** Default expiry in seconds: 3600 * 24 * 10 = 10 days. */
    public final static int DEFAULT_EXPIRES = 3600*24*10;
    private final static Logger logger = LoggerFactory.getLogger(MemcachedUtils.class);

    /**
     * Caches a value with the default expiry ({@link #DEFAULT_EXPIRES}), without waiting for a reply.
     *
     * @param key  cache key
     * @param data value to store
     */
    public static void put(String key, Object data) {
        put(key, data, DEFAULT_EXPIRES);
    }

    /**
     * Caches a value without waiting for a reply.
     *
     * @param key     cache key
     * @param data    value to store
     * @param seconds expiry in seconds
     */
    public static void put(String key, Object data, int seconds) {
        try {
            Memcached.getClient().setWithNoReply(key, seconds, data);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Caches an object with the default expiry ({@link #DEFAULT_EXPIRES}), without waiting for a reply.
     *
     * @param key   cache key
     * @param value object to store
     */
    public static <T> void putObject(String key, T value) {
        putObject(key, value, DEFAULT_EXPIRES);
    }

    /**
     * Caches an object as bytes without waiting for a reply: maps and iterables are stored
     * as JSON, everything else is serialized with Protostuff.
     *
     * @param key     cache key
     * @param value   object to store
     * @param seconds expiry in seconds
     */
    public static <T> void putObject(String key, T value, int seconds) {
        try {
            put(key, encodeObject(value), seconds);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Caches a list with the default expiry ({@link #DEFAULT_EXPIRES}), without waiting for a reply.
     * (This used to delegate to the blocking {@code setList}, unlike every other {@code put*} method.)
     *
     * @param key   cache key
     * @param value list to store
     */
    public static void putList(String key, List value) {
        putList(key, value, DEFAULT_EXPIRES);
    }

    /**
     * Caches a list without waiting for a reply. The encoding is chosen from the first element
     * (see {@link #setList(String, List, int)}). A {@code null} or empty list is not cached.
     *
     * @param key     cache key
     * @param value   list to store
     * @param seconds expiry in seconds
     */
    public static <T> void putList(String key, List<T> value, int seconds) {
        if (value == null || value.isEmpty()) {
            // The element type is unknown, so no encoding can be chosen.
            logger.warn("Skip caching null/empty list for key {}", key);
            return;
        }
        try {
            put(key, encodeList(value), seconds);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Caches a value and waits for the server's answer.
     *
     * @param key     cache key
     * @param data    value to store
     * @param seconds expiry in seconds
     * @return the client's result, or {@code false} when an exception occurred
     */
    public static boolean set(String key, Object data, int seconds) {
        try {
            return Memcached.getClient().set(key, seconds, data);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return false;
        }
    }

    /**
     * Caches an object with the default expiry ({@link #DEFAULT_EXPIRES}).
     *
     * @param key   cache key
     * @param value object to store
     * @return {@code true} when stored
     */
    public static <T> boolean setObject(String key, T value) {
        return setObject(key, value, DEFAULT_EXPIRES);
    }

    /**
     * Caches an object as bytes: maps and iterables are stored as JSON, everything else
     * is serialized with Protostuff.
     *
     * @param key     cache key
     * @param value   object to store
     * @param seconds expiry in seconds
     * @return {@code true} when stored, {@code false} on any failure
     */
    public static <T> boolean setObject(String key, T value, int seconds) {
        try {
            return set(key, encodeObject(value), seconds);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
        return false;
    }

    /**
     * Caches a list with the default expiry ({@link #DEFAULT_EXPIRES}).
     *
     * @param key   cache key
     * @param value list to store
     * @return {@code true} when stored
     */
    public static boolean setList(String key, List value) {
        return setList(key, value, DEFAULT_EXPIRES);
    }

    /**
     * Caches a list. The encoding depends on the first element: maps become a JSON array,
     * arrays and {@code java.lang.*} types are handed to the client as the list itself,
     * anything else is serialized with Protostuff. A {@code null} or empty list is not cached.
     *
     * @param key     cache key
     * @param value   list to store
     * @param seconds expiry in seconds
     * @return {@code true} when stored, {@code false} on any failure or for a null/empty list
     */
    public static <T> boolean setList(String key, List<T> value, int seconds) {
        if (value == null || value.isEmpty()) {
            // The element type is unknown, so no encoding can be chosen.
            logger.warn("Skip caching null/empty list for key {}", key);
            return false;
        }
        try {
            return set(key, encodeList(value), seconds);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
        return false;
    }

    /** Serializes a single object: JSON for maps and iterables, Protostuff otherwise. */
    private static <T> byte[] encodeObject(T value) {
        if (value instanceof Map) {
            return JSONObject.toJSONBytes(value, SerializerFeature.DisableCircularReferenceDetect);
        }
        if (value instanceof Iterable) {
            return JSONArray.toJSONBytes(value, SerializerFeature.DisableCircularReferenceDetect);
        }
        return ProtostuffUtil.serializer(value);
    }

    /** Picks the stored form of a non-empty list from the runtime type of its first element. */
    private static <T> Object encodeList(List<T> value) {
        T first = value.get(0);
        if (first instanceof Map) {
            return JSONArray.toJSONBytes(value);
        }
        Class<?> type = first.getClass();
        if (type.isArray() || type.getTypeName().startsWith("java.lang")) {
            return value;
        }
        return ProtostuffUtil.serializeList(value);
    }

    /**
     * Reads a cached value.
     *
     * @param key cache key
     * @return the value as returned by the client, or {@code null} on a miss or an error
     */
    public static <T> T get(String key) {
        try {
            return Memcached.getClient().get(key);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Reads an object written by {@code putObject}/{@code setObject}.
     *
     * @param key   cache key
     * @param clazz expected type
     * @return the decoded object, or {@code null} on a miss, empty payload or decoding error
     */
    public static <T> T getObject(String key, Class<T> clazz) {
        try {
            // Fetched inside the try so a value of an unexpected stored type is logged
            // and reported as a miss instead of escaping as a ClassCastException.
            byte[] bytes = get(key);
            if (bytes == null || bytes.length == 0) {
                return null;
            }
            // JSON object payloads start with '{' and end with '}'.
            boolean looksLikeJson = bytes[0] == '{' && bytes[bytes.length - 1] == '}';
            if (clazz.isAssignableFrom(Map.class) || looksLikeJson) {
                return JSONObject.parseObject(bytes, clazz);
            }
            return ProtostuffUtil.deserializer(bytes, clazz);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Reads a list written by {@code putList}/{@code setList}; {@code clazz} selects the
     * decoding the same way the element type selected the encoding.
     *
     * @param key   cache key
     * @param clazz element type
     * @return the list, or {@code null} on a miss, empty payload or decoding error
     */
    public static <T> List<T> getList(String key, Class<T> clazz) {
        try {
            if (clazz.isAssignableFrom(Map.class)) {
                byte[] data = get(key);
                if (data == null || data.length == 0) {
                    return null;
                }
                return JSONArray.parseObject(data, List.class);
            } else if (clazz.isArray() || clazz.getTypeName().startsWith("java.lang")) {
                return get(key);
            } else {
                byte[] data = get(key);
                if (data == null || data.length == 0) {
                    return null;
                }
                return ProtostuffUtil.deserializeList(data, clazz);
            }
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Reads a String value.
     *
     * @param key cache key
     * @return the value, or an empty string when absent
     */
    public static String getString(String key) {
        String val = get(key);
        return val == null ? "" : val;
    }

    /**
     * Reads an int value.
     *
     * @param key cache key
     * @return the value, or 0 when absent
     */
    public static int getInt(String key) {
        Integer val = get(key);
        return val == null ? 0 : val.intValue();
    }

    /**
     * Reads a long value.
     *
     * @param key cache key
     * @return the value, or 0 when absent
     */
    public static long getLong(String key) {
        Long val = get(key);
        return val == null ? 0 : val.longValue();
    }

    /**
     * Reads a float value.
     *
     * @param key cache key
     * @return the value, or 0 when absent
     */
    public static float getFloat(String key) {
        Float val = get(key);
        return val == null ? 0 : val.floatValue();
    }

    /**
     * Reads a double value.
     *
     * @param key cache key
     * @return the value, or 0 when absent
     */
    public static double getDouble(String key) {
        Double val = get(key);
        return val == null ? 0 : val.doubleValue();
    }

    /**
     * Reads a boolean value.
     *
     * @param key cache key
     * @return the value, or {@code false} when absent
     */
    public static boolean getBoolean(String key) {
        Boolean val = get(key);
        return val == null ? false : val.booleanValue();
    }

    /**
     * Increments the number stored under the key by 1.
     *
     * @param key cache key
     * @return the new value, or {@code null} on failure
     */
    public static Long incrAndGet(String key) {
        return incrAndGet(key, 1);
    }

    /**
     * Increments the number stored under the key.
     *
     * @param key   cache key
     * @param value amount to add
     * @return the new value, or {@code null} on failure
     */
    public static Long incrAndGet(String key, long value) {
        try {
            return Memcached.getClient().incr(key, value);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Increments the number stored under the key by 1 without waiting for a reply.
     *
     * @param key cache key
     */
    public static void incr(String key) {
        incr(key, 1);
    }

    /**
     * Increments the number stored under the key without waiting for a reply.
     *
     * @param key   cache key
     * @param value amount to add
     */
    public static void incr(String key, long value) {
        try {
            Memcached.getClient().incrWithNoReply(key, value);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Decrements the number stored under the key by 1.
     *
     * @param key cache key
     * @return the new value, or {@code null} on failure
     */
    public static Long decrAngGet(String key) {
        return decrAngGet(key, 1);
    }

    /**
     * Decrements the number stored under the key.
     *
     * @param key   cache key
     * @param value amount to subtract
     * @return the new value, or {@code null} on failure
     */
    public static Long decrAngGet(String key, long value) {
        try {
            return Memcached.getClient().decr(key, value);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Decrements the number stored under the key by 1 without waiting for a reply.
     *
     * @param key cache key
     */
    public static void decr(String key) {
        decr(key, 1);
    }

    /**
     * Decrements the number stored under the key without waiting for a reply.
     *
     * @param key   cache key
     * @param value amount to subtract
     */
    public static void decr(String key, long value) {
        try {
            Memcached.getClient().decrWithNoReply(key, value);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Sets a new expiry on an existing key.
     *
     * @param key     cache key
     * @param seconds new expiry in seconds
     * @return the client's result, or {@code false} when an exception occurred
     */
    public static boolean expire(String key, int seconds) {
        try {
            return Memcached.getClient().touch(key, seconds);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return false;
        }
    }

    /**
     * Removes every key from the cache.
     */
    public static void flushAll() {
        try {
            Memcached.getClient().flushAll();
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Deletes one key.
     *
     * @param key cache key
     */
    public static void del(String key) {
        try {
            Memcached.getClient().delete(key);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }
    }

    /**
     * Deletes several keys. Each key is deleted independently, so a failure on one key is
     * logged and the remaining keys are still processed.
     *
     * @param keys cache keys; {@code null} is treated as "nothing to delete"
     */
    public static void del(String... keys) {
        if (keys == null) {
            return;
        }
        for (String key : keys) {
            del(key);
        }
    }

    /**
     * Checks whether a key currently has a value.
     *
     * @param key cache key
     * @return {@code true} when a non-null value is stored, {@code false} otherwise or on error
     */
    public static boolean exists(String key) {
        try {
            return Memcached.getClient().get(key) != null;
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            return false;
        }
    }

    /**
     * Lists keys by walking the "stats items" output of a whalin {@link MemCachedClient} and
     * dumping each slab that reports an item count.
     *
     * @param mcc the client to query
     * @return the trimmed keys found in the cache dumps
     */
    public static List<String> getAllKeys(MemCachedClient mcc) {
        List<String> list = new ArrayList<String>();
        Map<String, Map<String, String>> items = mcc.statsItems();
        for (Map<String, String> stats : items.values()) {
            for (Map.Entry<String, String> stat : stats.entrySet()) {
                String statName = stat.getKey();
                if (!statName.endsWith("number")) {
                    continue;
                }
                // Stat name layout: item_str:integer:number_str - the middle part is the slab number,
                // the stat value is the item count used as the dump limit.
                String[] arr = statName.split(":");
                int slabNumber = Integer.parseInt(arr[1].trim());
                int limit = Integer.parseInt(stat.getValue().trim());
                Map<String, Map<String, String>> dumpMaps = mcc.statsCacheDump(slabNumber, limit);
                for (Map<String, String> dump : dumpMaps.values()) {
                    for (String cachedKey : dump.keySet()) {
                        list.add(cachedKey.trim());
                    }
                }
            }
        }
        return list;
    }

    /**
     * Manual smoke test: writes the key "lwg" twice and prints the value read back from "lwg1".
     * A miss prints "null" instead of failing with a NullPointerException.
     */
    public static void main(String[] args) {
        MemcachedUtils.put("lwg","liuweigang1",30);
        MemcachedUtils.put("lwg","liuweigang2",30);
        Object lwg = MemcachedUtils.get("lwg1");
        System.out.println(String.valueOf(lwg));
    }
}
package com.zzsn.common.cache;
import io.protostuff.*;
import io.protostuff.runtime.RuntimeSchema;
import org.springframework.objenesis.Objenesis;
import org.springframework.objenesis.ObjenesisStd;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Protostuff-based (de)serialization helpers for single objects and lists.
 * Instances are created through Objenesis, so target classes do not need a
 * no-argument constructor.
 *
 * @Author Sugar
 * @Version 2018/3/16 13:32
 */
public class ProtostuffUtil {
    public static final Objenesis objenesis = new ObjenesisStd(true);
    private static final String ERR_TRUNCATED_MESSAGE =
            "While parsing a protocol message, the input ended unexpectedly " +
            "in the middle of a field. This could mean either than the " +
            "input has been truncated or that an embedded message " +
            "misreported its own length.";

    /**
     * Serializes one object using the runtime schema of its class.
     *
     * @param obj the object to serialize
     * @return the serialized bytes
     * @throws IllegalStateException if serialization fails (including a {@code null} object)
     */
    @SuppressWarnings("unchecked")
    public static <T> byte[] serializer(T obj) {
        LinkedBuffer buffer = LinkedBuffer.allocate(LinkedBuffer.DEFAULT_BUFFER_SIZE);
        try {
            Schema<T> schema = RuntimeSchema.getSchema((Class<T>) obj.getClass());
            return ProtostuffIOUtil.toByteArray(obj, schema, buffer);
        } catch (Exception e) {
            throw new IllegalStateException("序列化对象失败:" + obj, e);
        } finally {
            buffer.clear();
        }
    }

    /**
     * Deserializes one object.
     *
     * @param data  the serialized bytes
     * @param clazz the target class
     * @return a new instance populated from {@code data}
     * @throws IllegalStateException if deserialization fails
     */
    public static <T> T deserializer(byte[] data, Class<T> clazz) {
        T obj = null;
        try {
            Schema<T> schema = RuntimeSchema.getSchema(clazz);
            // Objenesis instead of schema.newMessage(): no default constructor required.
            obj = objenesis.newInstance(clazz);
            ProtostuffIOUtil.mergeFrom(data, obj, schema);
        } catch (Exception e) {
            throw new IllegalStateException("反序列化对象失败:class=" + clazz + ", data=" + describe(data), e);
        }
        return obj;
    }

    /**
     * Serializes a list; the schema is taken from the class of the first element.
     * An empty list yields an empty byte array, which {@link #deserializeList(byte[], Class)}
     * reads back as an empty list (previously an IndexOutOfBoundsException escaped).
     *
     * @param list the list to serialize, must not be {@code null}
     * @return the serialized bytes
     * @throws IllegalStateException if writing the list fails
     */
    public static <T> byte[] serializeList(List<T> list) {
        if (list.isEmpty()) {
            // No first element to derive a schema from.
            return new byte[0];
        }
        @SuppressWarnings("unchecked")
        Schema<T> schema = (Schema<T>) RuntimeSchema.getSchema(list.get(0).getClass());
        LinkedBuffer buffer = LinkedBuffer.allocate(1024 * 1024);
        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            ProtostuffIOUtil.writeListTo(bos, list, schema, buffer);
            return bos.toByteArray();
        } catch (Exception e) {
            throw new IllegalStateException("序列化对象列表失败:" + list, e);
        } finally {
            buffer.clear();
        }
    }

    /**
     * Deserializes a list of objects.
     *
     * @param data  the serialized bytes
     * @param clazz the element class
     * @param <T>   the element type
     * @return the decoded list; an empty list for empty input
     * @throws IllegalStateException if the data cannot be parsed
     */
    public static <T> List<T> deserializeList(byte[] data, Class<T> clazz) {
        Schema<T> schema = RuntimeSchema.getSchema(clazz);
        List<T> result = null;
        try {
            result = parseListFrom(new ByteArrayInputStream(data), schema, clazz);
        } catch (IOException e) {
            throw new IllegalStateException("反序列化对象列表失败:class=" + clazz + ", data=" + describe(data), e);
        }
        return result;
    }

    /**
     * Describes a payload for error messages by its size. Decoding arbitrary binary with the
     * platform charset is unreadable, and a {@code null} payload must not raise a second
     * exception that hides the original cause.
     */
    private static String describe(byte[] data) {
        return data == null ? "null" : data.length + " bytes";
    }

    /**
     * Reads a varint element count followed by that many delimited messages.
     */
    private static <T> List<T> parseListFrom(final InputStream in, final Schema<T> schema, Class<T> clazz)
            throws IOException {
        int size = in.read();
        if (size == -1) {
            return Collections.emptyList();
        }
        if (size > 0x7f) {
            size = readRawVarint32(in, size);
        }
        if (size < 0) {
            // Corrupt count; report it as a parse error instead of an
            // IllegalArgumentException from the ArrayList constructor.
            throw new ProtobufException("CodedInput encountered a malformed varint.");
        }
        final ArrayList<T> list = new ArrayList<T>(size);
        final CodedInput input = new CodedInput(in, true);
        for (int i = 0; i < size; i++) {
            // Objenesis instead of schema.newMessage().
            final T message = objenesis.newInstance(clazz);
            list.add(message);
            schema.mergeFrom(input, message);
            input.checkLastTagWas(0);
        }
        // Only evaluated when assertions are enabled (-ea): the stream should be exhausted.
        assert in.read() == -1;
        return list;
    }

    /**
     * Reads a varint from the input one byte at a time, so that it does not read any bytes after the end of the varint.
     * If you simply wrapped the stream in a CodedInput and used readRawVarint32(InputStream) then you would
     * probably end up reading past the end of the varint since CodedInput buffers its input.
     */
    private static int readRawVarint32(final InputStream input, final int firstByte) throws IOException {
        int result = firstByte & 0x7f;
        int offset = 7;
        for (; offset < 32; offset += 7) {
            final int b = input.read();
            if (b == -1) {
                throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
            }
            result |= (b & 0x7f) << offset;
            if ((b & 0x80) == 0) {
                return result;
            }
        }
        // Keep reading up to 64 bits; the extra bytes are consumed but do not change the result.
        for (; offset < 64; offset += 7) {
            final int b = input.read();
            if (b == -1) {
                throw new ProtobufException(ERR_TRUNCATED_MESSAGE);
            }
            if ((b & 0x80) == 0) {
                return result;
            }
        }
        throw new ProtobufException(
                "CodedInput encountered a malformed varint.");
    }
}
//package com.zzsn.configuration;
//
//import com.baomidou.mybatisplus.autoconfigure.ConfigurationCustomizer;
//import com.baomidou.mybatisplus.extension.plugins.PaginationInterceptor;
//import org.springframework.context.annotation.Bean;
//import org.springframework.context.annotation.Configuration;
//
//
//@Configuration
//public class MybatisPlusConfig {
//
// /**
// * 新的分页插件,一缓和二缓遵循mybatis的规则,需要设置 MybatisConfiguration#useDeprecatedExecutor = false 避免缓存出现问题
// */
//// @Bean
//// public MybatisPlusInterceptor mybatisPlusInterceptor() {
//// MybatisPlusInterceptor interceptor = new MybatisPlusInterceptor();
//// //注释下面的可能出现获取不到总数的效果
//// interceptor.addInnerInterceptor(new PaginationInnerInterceptor(DbType.MYSQL));
//// return interceptor;
//// }
// @Bean
// public PaginationInterceptor paginationInterceptor() {
// return new PaginationInterceptor();
// }
//
// @Bean
// public ConfigurationCustomizer configurationCustomizer() {
// return configuration -> configuration.setUseDeprecatedExecutor(false);
// }
//
//}
\ No newline at end of file
package com.zzsn.configuration;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.concurrent.TimeUnit;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import okhttp3.ConnectionPool;
import okhttp3.OkHttpClient;
/**
 * Spring configuration that exposes a shared {@link OkHttpClient} together with its
 * connection pool and a trust-all SSL setup.
 */
@Configuration
public class OkHttpConfiguration {

    /**
     * Builds the shared HTTP client: no automatic retry on connection failure, the pool from
     * {@link #pool()}, and 30 second connect/read/write timeouts.
     * The trust-all SSL socket factory is currently NOT installed (see the disabled line).
     *
     * @return the configured client
     */
    @Bean
    public OkHttpClient okHttpClient() {
        return new OkHttpClient.Builder()
                // .sslSocketFactory(sslSocketFactory(), x509TrustManager())
                .retryOnConnectionFailure(false)
                .connectionPool(pool())
                .connectTimeout(30, TimeUnit.SECONDS)
                .readTimeout(30, TimeUnit.SECONDS)
                .writeTimeout(30, TimeUnit.SECONDS)
                .build();
    }

    /**
     * Trust manager that accepts every certificate chain.
     * <p>
     * WARNING: this performs no certificate validation at all; anything wired to it is
     * open to man-in-the-middle attacks.
     *
     * @return a trust-all manager with an empty accepted-issuers list
     */
    @Bean
    public X509TrustManager x509TrustManager() {
        return new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                // intentionally empty: trust everything
            }

            @Override
            public void checkServerTrusted(X509Certificate[] x509Certificates, String s) throws CertificateException {
                // intentionally empty: trust everything
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return new X509Certificate[0];
            }
        };
    }

    /**
     * Socket factory backed by a TLS context initialized with the trust-all manager.
     *
     * @return the socket factory, never {@code null}
     * @throws IllegalStateException if the TLS context cannot be created or initialized;
     *                               previously the error was printed and {@code null} was
     *                               returned from this bean method
     */
    @Bean
    public SSLSocketFactory sslSocketFactory() {
        try {
            // Trust any connection.
            SSLContext sslContext = SSLContext.getInstance("TLS");
            sslContext.init(null, new TrustManager[] { x509TrustManager() }, new SecureRandom());
            return sslContext.getSocketFactory();
        } catch (NoSuchAlgorithmException | KeyManagementException e) {
            throw new IllegalStateException("Unable to initialise the TLS socket factory", e);
        }
    }

    /**
     * Connection pool for the shared client: up to 200 idle connections, each kept alive
     * for 5 minutes.
     *
     * @return the connection pool
     */
    @Bean
    public ConnectionPool pool() {
        return new ConnectionPool(200, 5, TimeUnit.MINUTES);
    }
}
//package com.zzsn.configuration;
//
//import org.springframework.context.annotation.Configuration;
//import org.springframework.scheduling.annotation.SchedulingConfigurer;
//import org.springframework.scheduling.config.ScheduledTaskRegistrar;
//
//import java.util.concurrent.Executors;
//
//@Configuration
//public class ScheduleConfig implements SchedulingConfigurer {
// @Override
// public void configureTasks(ScheduledTaskRegistrar taskRegistrar) {
// //当然了,这里设置的线程池是corePoolSize也是很关键了,自己根据业务需求设定
// taskRegistrar.setScheduler(Executors.newScheduledThreadPool(1));
//
//
// /**为什么这么说呢?
// 假设你有4个任务需要每隔1秒执行,而其中三个都是比较耗时的操作可能需要10多秒,而你上面的语句是这样写的:
// taskRegistrar.setScheduler(Executors.newScheduledThreadPool(3));
// 那么仍然可能导致最后一个任务被阻塞不能定时执行
// **/
// }
//}
package com.zzsn.configuration;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
import java.util.Map;
/**
 * Static access to the Spring {@link ApplicationContext}, so beans can be looked up from
 * code that is not managed by the container.
 * <p>
 * The context is captured when Spring calls {@link #setApplicationContext}; every lookup
 * before that point fails with a {@link NullPointerException}.
 *
 * @author https://blog.csdn.net/chen_2890
 * @date 2019/6/26 16:20
 */
@Component
public class SpringContextUtil implements ApplicationContextAware {
    private static ApplicationContext applicationContext;

    /**
     * Stores the context in a static field for the static accessors below.
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        SpringContextUtil.applicationContext = applicationContext;
    }

    /**
     * @return the captured application context, or {@code null} if none was set yet
     */
    public static ApplicationContext getApplicationContext() {
        return applicationContext;
    }

    /**
     * Looks a bean up by name.
     *
     * @param name bean name
     * @return the bean, or {@code null} when no bean with that name is defined
     */
    public static Object getBean(String name) {
        Object o = null;
        try {
            o = getApplicationContext().getBean(name);
        } catch (NoSuchBeanDefinitionException e) {
            // Deliberate: a missing bean is reported to the caller as null.
        }
        return o;
    }

    /**
     * Looks a bean up by type.
     *
     * @param clazz bean type
     * @return the matching bean
     */
    public static <T> T getBean(Class<T> clazz) {
        return getApplicationContext().getBean(clazz);
    }

    /**
     * Looks a bean up by name and type.
     *
     * @param name  bean name
     * @param clazz bean type
     * @return the matching bean
     */
    public static <T> T getBean(String name, Class<T> clazz) {
        return getApplicationContext().getBean(name, clazz);
    }

    /**
     * Returns all beans of the given type.
     *
     * @param clazz bean type
     * @return the matching beans keyed by bean name
     */
    public static <T> Map<String, T> getBeansOfType(Class<T> clazz) {
        return getApplicationContext().getBeansOfType(clazz);
    }

    /**
     * Reads a configuration property from the environment.
     *
     * @param key property key
     * @return the property value, or {@code null} when it is not defined
     */
    public static String getEnvironmentProperty(String key) {
        return getApplicationContext().getEnvironment().getProperty(key);
    }

    /**
     * Returns the first active profile (spring.profiles.active).
     *
     * @return the first active profile, or {@code null} when no profile is active
     *         (previously this case threw an ArrayIndexOutOfBoundsException)
     */
    public static String getActiveProfile() {
        String[] activeProfiles = getApplicationContext().getEnvironment().getActiveProfiles();
        return activeProfiles.length > 0 ? activeProfiles[0] : null;
    }
}
package com.zzsn.configuration;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
import java.util.concurrent.Executor;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * Enables Spring's asynchronous method execution and registers the executor
 * bean named {@code asyncTaskExecutor}.
 */
@Configuration
@EnableAsync
public class ThreadExecutorConfig {

    /**
     * Creates a thread pool with exactly one worker thread and a 5000-entry queue.
     *
     * @return the initialized executor
     */
    @Bean(value = "asyncTaskExecutor")
    public Executor executor() {
        final ThreadPoolTaskExecutor pool = new ThreadPoolTaskExecutor();
        pool.setThreadNamePrefix("ssmsExecutor-");

        // Sizing: minimum and maximum number of threads are both 1; waiting tasks are queued.
        pool.setCorePoolSize(1);
        pool.setMaxPoolSize(1);
        pool.setQueueCapacity(5000);

        // Allowed idle time, in seconds.
        pool.setKeepAliveSeconds(60);

        // Rejection policy: a task the pool cannot accept is run by the
        // submitting thread itself instead of on a pool thread.
        pool.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

        pool.initialize();
        return pool;
    }
}
\ No newline at end of file
package com.zzsn.crawler;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.awx.service.SiteService;
import org.springframework.beans.BeansException;
import java.util.TimerTask;
/**
 * Timer task that prints a start message and then runs {@link #task()}.
 */
public class SiteTask extends TimerTask {

    /**
     * Timer entry point: logs to stdout, then delegates to {@link #task()}.
     */
    @Override
    public void run() {
        System.out.println("开是发送信息");
        task();
    }

    /**
     * Resolves the {@link SiteService} bean from the Spring context. The actual send call is
     * currently disabled, so only the lookup happens; a {@link BeansException} from the
     * lookup is printed and otherwise ignored.
     */
    public static void task() {
        try {
            // Lookup result is unused while the send call below stays commented out.
            SpringContextUtil.getBean(SiteService.class);
            // sites.sendUrlToweixin("");
        } catch (BeansException lookupFailure) {
            lookupFailure.printStackTrace();
        }
    }
}
package com.zzsn.crawler;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.kafka.core.KafkaTemplate;
import java.util.HashMap;
import java.util.Map;
@Slf4j
public class WeixinDetailThread extends Thread{
public SiteMsgTemple siteMsgTemple =new SiteMsgTemple();
public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
@Override
public void run() {
detailCrawler();
}
/**
 * Crawls the article at {@code siteMsgTemple.getSiteUri()} and publishes it to Kafka.
 *
 * <p>Steps: skip the URL if Redis already holds a marker for it, extract the article with
 * {@link WeiXinDispatch}, remove every tag reported by {@link ContentFileFinder}, wrap the
 * remaining content in a minimal HTML document, convert it to a {@link ClbAnsProcessitem},
 * send it as JSON to {@code Constants.KAFKA_PRODUCT_TOPIC} and finally mark the URL in Redis.
 *
 * <p>An article without a title or without content is NOT sent.
 *
 * @return {@code true} only when the article was handed to Kafka; {@code false} when it was
 *         skipped (already crawled, extraction failed, incomplete data) or sending failed
 */
public boolean detailCrawler(){
    boolean flag = false;
    String weixinurl = siteMsgTemple.getSiteUri();
    if (StringUtils.isEmpty(weixinurl)) {
        log.warn("信息源地址为空,跳过采集");
        return flag;
    }

    // Skip URLs that were crawled before. A Redis failure is not fatal: we simply crawl again.
    try {
        String urlflag = JedisUtil.getString(weixinurl);
        if (!StringUtils.isEmpty(urlflag)) {
            log.info("已爬取" + weixinurl);
            return flag;
        }
    } catch (Exception e) {
        log.warn("redis获取信息失败", e);
    }

    String weixinid = getParam(weixinurl);
    log.info("爬取的微信id= " + weixinid);

    WeiXinDispatch wx = new WeiXinDispatch();
    ExtEntity extEntity = wx.getExtractorElement(weixinurl);
    if (extEntity == null) {
        // NOTE(review): getExtractorElement is not visible here; guard in case it returns null on failure.
        log.warn("正文抽取失败: " + weixinurl);
        return flag;
    }

    String formatImgContent = extEntity.getContentWithTag();
    if (StringUtils.isEmpty(formatImgContent)) {
        // Without this guard a null body would be appended below as the literal text "null".
        log.warn("资讯的信息不全没有发送");
        return flag;
    }

    // Only the map keys (raw tag markup) are used below, and they do not depend on the base
    // address, so the fixed address merely lets ContentFileFinder resolve relative links.
    final String baseAddress = "https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw";
    Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(formatImgContent, baseAddress);
    for (String key : imgDataMap.keySet()) {
        // Repeat until stable: removing one occurrence can splice a new one together.
        // The empty-key guard prevents an endless loop (contains("") is always true).
        while (key.length() > 0 && formatImgContent.contains(key)) {
            formatImgContent = formatImgContent.replace(key, "");
        }
    }
    extEntity.setContentWithTag(formatImgContent);

    long sid;
    try {
        sid = Long.parseLong(siteMsgTemple.getId());
    } catch (NumberFormatException e) {
        log.warn("信息源id不是数字: " + siteMsgTemple.getId());
        return flag;
    }

    DocInfo docInfo = new DocInfo();
    docInfo.setSid(sid);
    docInfo.setSourceaddress(weixinurl);
    docInfo.setLang("zh_CN");
    docInfo.setContentType("HTML");
    // NOTE(review): the effective value has always been "News" here; confirm whether "WeChat" was intended.
    docInfo.setSourceType("News");
    docInfo.setCharset("utf-8");
    docInfo.setTitle(extEntity.getTitle());
    docInfo.setAuthor(extEntity.getAuthor());
    docInfo.setPublishDate(extEntity.getPublishDate());
    docInfo.setOrigin("微信公众号-" + extEntity.getAuthor());

    // Wrap the cleaned fragment in a minimal standalone HTML document.
    StringBuilder page = new StringBuilder();
    page.append("<html><head>");
    page.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
    page.append("<title></title></head><body>");
    page.append(formatImgContent);
    page.append("</body></html>");
    String pageHtml = page.toString();

    docInfo.setContentWithTag(pageHtml);
    docInfo.setContentNoTag(ContentUtility.TransferHTML2Text(pageHtml));
    docInfo.setContentImgCvtTag(pageHtml);

    ClbAnsProcessitem processitem = docInfoTrans2Processitem(docInfo);
    if (StringUtils.isEmpty(processitem.getTitle()) || StringUtils.isEmpty(processitem.getContent())) {
        // Incomplete articles must not reach Kafka.
        log.warn("资讯的信息不全没有发送");
        return flag;
    }

    try {
        String docjson = new ObjectMapper().writeValueAsString(processitem);
        kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
        log.info("发送到kafka成功。");
        flag = true;
    } catch (Exception e) {
        log.error("发送到kafka失败。", e);
        return flag;
    }

    // Mark as crawled. A failure here must not be reported as a Kafka failure.
    try {
        JedisUtil.setString(weixinurl, "1", -1);
    } catch (Exception e) {
        log.warn("标记已爬取失败: " + weixinurl, e);
    }
    return flag;
}
/**
 * Builds a new {@link ClbAnsProcessitem} from a {@link DocInfo}.
 *
 * <p>Copied fields: sid (as text), title, plain content, tagged content, summary, author,
 * origin, publish date and source address. All other target fields stay unset.
 *
 * @param docInfo source document; must not be {@code null}
 * @return the populated process item
 */
public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
    final ClbAnsProcessitem item = new ClbAnsProcessitem();

    // Identification and provenance.
    item.setSid(String.valueOf(docInfo.getSid()));
    item.setSourceAddress(docInfo.getSourceaddress());
    item.setOrigin(docInfo.getOrigin());
    item.setAuthor(docInfo.getAuthor());
    item.setPublishDate(docInfo.getPublishDate());

    // Textual payload.
    item.setTitle(docInfo.getTitle());
    item.setSummary(docInfo.getSummary());
    item.setContent(docInfo.getContentNoTag());
    item.setContentWithtag(docInfo.getContentWithTag());

    return item;
}
/**
 * Parses the query string of a URL into a key/value map.
 *
 * <p>Only the text between the first {@code ?} and the next {@code ?} (if any) is parsed.
 * A trailing {@code #fragment} is ignored. Each parameter is split at its FIRST {@code =},
 * so values that themselves contain {@code =} (for example base64 padding such as
 * {@code __biz=MzAw...==}) are kept intact. A parameter without {@code =} maps to an empty
 * string; empty parameters (from {@code &&}) are skipped. Values are not URL-decoded.
 *
 * @param url the URL to parse; may be {@code null} or blank
 * @return a mutable map of parameters; empty when there is nothing to parse
 */
public static Map<String,String> parse(String url) {
    Map<String,String> map = new HashMap<String,String>();
    if (url == null) {
        return map;
    }
    url = url.trim();
    if (url.equals("")) {
        return map;
    }
    String[] urlParts = url.split("\\?");
    // Fewer than two parts: no query at all (also covers the degenerate input "?").
    if (urlParts.length < 2) {
        return map;
    }
    String query = urlParts[1];
    int fragmentStart = query.indexOf('#');
    if (fragmentStart >= 0) {
        query = query.substring(0, fragmentStart);
    }
    for (String param : query.split("&")) {
        if (param.length() == 0) {
            continue;
        }
        int separator = param.indexOf('=');
        if (separator < 0) {
            map.put(param, "");
        } else {
            map.put(param.substring(0, separator), param.substring(separator + 1));
        }
    }
    return map;
}
/**
 * Returns the value of the {@code __biz} query parameter of a URL.
 *
 * <p>If parsing throws, the stack trace and the URL are printed and the lookup is done on an
 * empty map, so the result is {@code null}.
 *
 * @param url the URL to inspect
 * @return the {@code __biz} value, or {@code null} when it is absent or parsing failed
 */
public static String getParam(String url) {
    Map<String, String> params;
    try {
        params = parse(url);
    } catch (Exception parseFailure) {
        // TODO Auto-generated catch block
        parseFailure.printStackTrace();
        System.out.println(url);
        params = new HashMap<String, String>();
    }
    return params.get("__biz");
}
}
package com.zzsn.crawler;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.awx.service.SiteService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.kafka.core.KafkaTemplate;
/**
 * Worker thread that forwards one information source to {@code SiteService.sendUrlToweixin}
 * and then pauses for two minutes.
 */
@Slf4j
public class WeixinSiteThread extends Thread{

    /** Pause after each send, in milliseconds (two minutes). */
    private static final long PAUSE_AFTER_SEND_MILLIS = 1000L * 60 * 2;

    /** Information source handled by this thread; callers replace it before starting the thread. */
    public SiteMsgTemple siteMsgTemple=new SiteMsgTemple();

    /** Kafka template resolved from the Spring context; not used inside this class. */
    public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);

    /** Thread entry point: delegates to {@link #crawler()}. */
    @Override
    public void run() {
        crawler();
    }

    /**
     * Sends the configured source to {@code SiteService} and then sleeps for the pause interval.
     * If the thread is interrupted while sleeping, the interrupt flag is restored so that the
     * code driving this thread can still observe the interruption.
     */
    public synchronized void crawler(){
        SiteService sites= SpringContextUtil.getBean(SiteService.class);
        sites.sendUrlToweixin(siteMsgTemple);
        try {
            Thread.sleep(PAUSE_AFTER_SEND_MILLIS);
        } catch (InterruptedException interrupted) {
            // Do not swallow the interruption: re-assert it for whoever manages this thread.
            Thread.currentThread().interrupt();
        }
    }
}
package com.zzsn.entity;
import lombok.Data;
import java.util.List;
/**
 * Plain data holder for one processed item. Accessors are generated by Lombok's {@code @Data}.
 * Several fields are marked "unknown" in the original comments; their meaning is not
 * established anywhere in this file.
 */
@Data
public class ClbAnsProcessitem {
/** Primary key. */
private String id;
/** Information source id. */
private String sid;
/** Team id. */
private String tid;
/** Title. */
private String title;
/** Summary. */
private String summary;
/** Keywords. */
private String keyWords;
/** Body text. */
private String content;
// NOTE(review): undocumented in the original; by name, the body text including markup.
private String contentWithtag;
/** Unknown (undocumented in the original). */
private String hash;
/** Author. */
private String author;
/** Source site. */
private String sourceSite;
/** Source address. */
private String sourceAddress;
/** Unknown (undocumented in the original). */
private String currentProcess;
/** Category. */
private String type;
/** Unknown (undocumented in the original). */
private String withTagFile;
/** Publish time. */
private String publishDate;
/** Creator. */
private String createBy;
/** Creation time. */
private String createDate;
/** Character encoding. */
private String charset;
/** Unknown (undocumented in the original). */
private Integer processResult;
/** Time of the most recent update. */
private String lastModified;
/** Organization id. */
private String orgId;
/** Words. */
private String words;
/** Origin. */
private String origin;
/** Unknown (undocumented in the original). */
private String orientation;
/** Where the item came from. */
private String fromWhere;
/** Id of the origin. */
private String fromId;
/** Source type. */
private String sourceType;
/** Unknown (undocumented in the original). */
private String featureWords;
/** Download address. */
private String fileDownloadPath;
// NOTE(review): undocumented in the original; purpose not established here.
private String contentImgCvtTag;
/** Related places. */
private String relatePlaces;
/** Related persons. */
private String relatePerson;
/** Related organizations. */
private String relateOrg;
/** Related event. */
private String relateEvent;
/** Related date. */
private String relateDate;
/** Unknown (undocumented in the original). */
private Integer relevance1;
/** Unknown (undocumented in the original). */
private String relevance;
/** Language. */
private String lang;
/** Organizations. */
private String orgs;
/** (Temporary handling) ids of the related subjects. */
private List<String> subjectIds;
}
\ No newline at end of file
package com.zzsn.entity;
import lombok.Data;
import java.io.Serializable;
import java.util.Map;
/**
 * Data interface document.
 * Author: Li Dongliang
 * Created: 2016-04-06 15:44:17
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 * Getters and setters for the private fields are generated by Lombok's {@code @Data};
 * the hand-written accessors that used to live here as commented-out code are redundant.
 */
@Data
public class DocInfo implements Serializable{
private static final long serialVersionUID = 1L;
// NOTE(review): the only public field in this class; the others are private.
public String id;
private String contentType;
private Long orgId;
private Long sid;
// News: news, BBS: forum, Blog: blog, MicroBlog: microblog, WeChat: WeChat, Video: video, Other: other
private String sourceType;
private String lastModified;
private String charset;
private String sourceaddress;
private String lang;
private String title;
private String author;
private String publishDate;
private String origin;
private String keywords;
private String summary;
private String contentWithTag;
private String contentNoTag;
private String contentImgCvtTag;
private String fileDownLoadPath;
private Map<String,String> otherParams;
}
package com.zzsn.entity;
import lombok.Data;
import java.util.Date;
/**
 * Record of one collection/dispatch run for an information source.
 * Fields are package-private; accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class SiteMsgRecord {
/** Information source id. */
String infoSourceId;
/** Information source code. */
String code;
/** Number of items collected in this run. */
Integer num;
/** Time this collection run started. */
Date collectTime;
/** Dispatch time. */
Date dispatcherTime;
/** Dispatch status (0: failed, 1: succeeded). */
String dispatcherStatus;
/** Origin of the record (1: collection, 2: dispatch). */
String source;
}
package com.zzsn.entity;
import lombok.Data;
import java.io.Serializable;
import java.util.regex.Pattern;
/**
 * Configuration of one information source: list page, detail page, data table, paging,
 * login and crawler settings. Accessors are generated by Lombok's {@code @Data}.
 */
@Data
public class SiteMsgTemple implements Serializable {
private static final long serialVersionUID = 1L;
/** Primary key. */
private String id;
/** Information source code. */
private String infoSourceCode;
/** Information source name. */
private String webSiteName;
/** Column (section) name. */
private String siteName;
/** Column (section) address. */
private String siteUri;
/** Language. */
private String language;
/** Overseas / public / behind the firewall. */
private String checkedList;
/** URL of historical data. */
private String hisUriExp;
/** Start time of historical data. */
private java.util.Date hisDateStartTime;
/** End time of historical data. */
private java.util.Date hisDateEndTime;
/** Whether to take all historical data. */
private String ynHisDataAll;
/** Site level. */
private String siteLevel;
/** Status. */
private Integer status;
/** List page URL. */
private String listUrl;
/** Expression type (list page). */
private String listExpressionType;
/**
 * Position of the information block in the list.
 */
private String infoBlockPosition;
/**
 * Locator for the links to extract.
 */
private String linkLocation;
/** Matches the list of news items. */
private String informationItem;
/** Matches the URL of a news item. */
private String informationUrl;
/** Matches the title of a news item. */
private String informationTitle;
/** Matches the publish time of a news item. */
private String informationPublishDate;
/** Matches the source of a news item. */
private String informationSource;
/** Custom entity. */
private Object extractInfo;
/** Crawl depth. */
private Integer crawlDepth;
/** Page URL. */
private String pageUrl;
/** Page-number match. */
private String matchPage;
/** First page number. */
private Integer pageStart;
/** Last page number. */
private Integer pageEnd;
/** Whether to take all pages. */
private String ynPageAll;
/** Expression type (detail page). */
private String detailExpressionType;
/** Detail page URL. */
private String detailUrl;
/** Matches the detail page title. */
private String detailExpressionTitle;
/** Matches the detail page publish time. */
private String detailExpressionPublishDate;
/** Matches the detail page source. */
private String detailExpressionSource;
/** Matches the detail page author. */
private String detailExpressionAuthor;
/** Matches the detail page summary. */
private String detailExpressionSummary;
/** Matches the detail page body. */
private String detailExpressionContent;
/** Custom entity. */
private Object detailInfo;
/** Whether to download attachments. */
private String ynDownload;
/** URL of the data table page. */
private String formUrl;
/** Title of the data table. */
private String formTitle;
/** Expression type (data table). */
private Integer formType;
/** Data table expression. */
private String dataFormExpression;
/** Custom data. */
private Object dataFormInfo;
/** Page URL (data table). */
private String dataPageUrl;
/** Paging rule. */
private String dataPageRule;
/** First page number (data table). */
private Integer dataPageStart;
/** Last page number (data table). */
private Integer dataPageEnd;
/** Whether to take all pages (data table). */
private String ynDataPageAll;
/** Data type. */
private Integer dataType;
/** Data format. */
private Integer dataFormat;
/** Data storage mode. */
private Integer dataStorageMode;
/** Data storage information. */
private Object dataStorageInfo;
/** Whether to crawl dynamically. */
private Integer ynDynamicCrawl;
/** Whether login is required. */
private Integer ynLogin;
/** Login domain. */
private String domainName;
/** Login link. */
private String link;
/** Login account. */
private String account;
/** Login password. */
private String password;
/** userAgent */
private String userAgent;
/** referer */
private String referer;
/** cookies */
private String cookies;
/** headers */
private String headers;
/** Other parameters. */
private String otherInfo;
/** Crawler type. */
private Integer crawlType;
/** Crawler name. */
private String crawlName;
/** Crawler address. */
private String crawlAddress;
/** Parameter. */
private Object parameter;
/** Cron expression. */
private String cron;
// NOTE(review): not part of the documented configuration above; purpose not established in this file.
private Pattern pattern;
}
package com.zzsn.entity;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
/**
 * WeChat URL conversion record.
 * Author: Li Huawei
 * Created: 2016-08-10 10:56:41
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 *
 * Null properties are left out when the object is serialized (see the annotation below).
 */
@JsonSerialize(include= JsonSerialize.Inclusion.NON_NULL)
public class Wxurl implements Cloneable {

    /** Identifier (database column ID). */
    private Long id;

    /** Marker telling whether the URL has been converted. */
    private Long flag;

    /** The URL to convert. */
    private String ourl;

    /** The URL after conversion. */
    private String nurl;

    /** @return the identifier */
    public Long getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the conversion marker */
    public Long getFlag() {
        return this.flag;
    }

    /** @param flag the conversion marker */
    public void setFlag(Long flag) {
        this.flag = flag;
    }

    /** @return the URL to convert */
    public String getOurl() {
        return this.ourl;
    }

    /** @param ourl the URL to convert */
    public void setOurl(String ourl) {
        this.ourl = ourl;
    }

    /** @return the converted URL */
    public String getNurl() {
        return this.nurl;
    }

    /** @param nurl the converted URL */
    public void setNurl(String nurl) {
        this.nurl = nurl;
    }
}
package com.zzsn.extractor;
import com.zzsn.util.DateUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 获取正文中的图片或者文件
* 创建人:李东亮
* 创建时间:2016-8-30 下午5:25:04
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class ContentFileFinder {
/**
 * Returns the parent path of a URL or path, i.e. everything before its last {@code /}.
 *
 * <p>Inputs that have no parent are returned unchanged instead of failing: a value without
 * any {@code /}, and a URL whose only slashes belong to the {@code ://} scheme separator
 * (for example {@code http://host}).
 *
 * @param path the URL or path; may be {@code null}
 * @return the parent path, the input itself when it has no parent, or {@code null} for {@code null}
 */
public static String getDirPath(String path) {
    if (path == null) {
        return null;
    }
    int lastSlash = path.lastIndexOf('/');
    if (lastSlash < 0) {
        return path;
    }
    // Never cut inside "scheme://": that would turn "http://host" into "http:/".
    int schemeSeparator = path.indexOf("://");
    if (schemeSeparator >= 0 && lastSlash <= schemeSeparator + 2) {
        return path;
    }
    return path.substring(0, lastSlash);
}
/**
 * Resolves a link found on a page against the directory URL of that page.
 *
 * <ul>
 *   <li>{@code //host/x} (protocol-relative): the page's scheme is put in front.</li>
 *   <li>{@code /x} (root-relative): resolved against the page's host.</li>
 *   <li>anything else (relative): appended to {@code currentPageURL}; {@code .}, {@code ..}
 *       and empty segments are collapsed, and {@code ..} never climbs above the host.</li>
 * </ul>
 *
 * <p>The resolution is done purely on strings, so the result is the same on every platform.
 * A query string or fragment on {@code imgPath} is carried over untouched.
 *
 * @param currentPageURL directory URL of the current page, without trailing slash
 * @param imgPath        the link to resolve
 * @return the resolved URL
 */
public static String formatPath(String currentPageURL,String imgPath) {
    String start="";
    if(currentPageURL.indexOf("http://")!=-1){
        start = "http://";
    }else if(currentPageURL.indexOf("https://")!=-1){
        start = "https://";
    }
    // Page URL with the scheme removed: "host/dir/..."
    String hostAndPath = start.length() == 0 ? currentPageURL : currentPageURL.replace(start, "");

    // Protocol-relative: strip only the leading "//", keep any later double slash.
    if(imgPath.startsWith("//")){
        return start + imgPath.substring(2);
    }
    // Root-relative: keep only the host part of the page URL.
    if(imgPath.startsWith("/")){
        int hostEnd = hostAndPath.indexOf("/");
        if(hostEnd==-1){
            hostEnd = hostAndPath.length();
        }
        return start + hostAndPath.substring(0, hostEnd) + imgPath;
    }

    // Relative: set query/fragment aside so their content is not treated as path segments.
    String relative = imgPath;
    String tail = "";
    int queryStart = imgPath.indexOf('?');
    int fragmentStart = imgPath.indexOf('#');
    int cut = queryStart < 0 ? fragmentStart
            : (fragmentStart < 0 ? queryStart : Math.min(queryStart, fragmentStart));
    if (cut >= 0) {
        relative = imgPath.substring(0, cut);
        tail = imgPath.substring(cut);
    }

    List<String> segments = new ArrayList<String>();
    for (String segment : (hostAndPath + "/" + relative).split("/")) {
        if (segment.length() == 0 || ".".equals(segment)) {
            continue;
        }
        if ("..".equals(segment)) {
            // The first segment is the host; never pop it.
            if (segments.size() > 1) {
                segments.remove(segments.size() - 1);
            }
            continue;
        }
        segments.add(segment);
    }

    StringBuilder result = new StringBuilder(start);
    for (int i = 0; i < segments.size(); i++) {
        if (i > 0) {
            result.append('/');
        }
        result.append(segments.get(i));
    }
    return result.append(tail).toString();
}
/**
 * Generates the save path for a downloaded file: {@code <yyyy-MM-dd>/<random UUID><suffix>}.
 *
 * @param suffix file suffix appended verbatim after the UUID (may be empty)
 * @return the generated relative save path
 */
private static String genImgFileName(String suffix){
    final String dayFolder = DateUtil.format(new Date(), "yyyy-MM-dd");
    return dayFolder + "/" + UUID.randomUUID().toString() + suffix;
}
/**
 * Makes sure an image element has a {@code src} attribute.
 *
 * <p>The attributes {@code original}, {@code data-src}, {@code _src} and {@code src} are
 * checked in that order; the value of the first one present is written to {@code src}.
 *
 * @param imgTag the image element; it is modified in place
 * @return the same element, or {@code null} when none of the four attributes is present
 */
public static Element ensureSrc(Element imgTag){
    final String[] candidates = {"original", "data-src", "_src", "src"};
    for (String attribute : candidates) {
        if (imgTag.hasAttr(attribute)) {
            imgTag.attr("src", imgTag.attr(attribute));
            return imgTag;
        }
    }
    return null;
}
/**
 * Reads a link attribute from an element and turns it into an absolute URL.
 *
 * @param element the element carrying the link
 * @param uri     address of the page the element came from; when {@code null}, the link is
 *                returned as found
 * @param linkAtt name of the attribute holding the link ({@code src} or {@code href})
 * @return the absolute link, or {@code null} for {@code data:image}, {@code file:},
 *         {@code javascript...} and {@code #} links
 */
private static String getAbsolutePath(Element element,String uri,String linkAtt){
    final String link = element.attr(linkAtt);

    boolean unusable = link.startsWith("data:image")
            || link.startsWith("file:")
            || link.matches("(?i)^javascript.*|#");
    if (unusable) {
        return null;
    }

    boolean alreadyAbsolute = link.startsWith("http://") || link.startsWith("https://");
    if (alreadyAbsolute || uri == null) {
        return link;
    }
    return formatPath(getDirPath(uri), link);
}
/**
 * Extracts the file suffix (including the dot) from a URL.
 *
 * <p>The scheme is removed first; the suffix is a dot followed by one to four word
 * characters at the very end of the remaining text, after at least one {@code /}.
 *
 * @param uri the URL to inspect
 * @return the suffix such as {@code .xls}, or an empty string when there is none
 */
public static String getSuffix(String uri){
    String withoutScheme = uri.replaceAll("http://|https://", "");
    Matcher suffixMatcher = Pattern.compile("/.+(\\.\\w{1,4})$").matcher(withoutScheme);
    return suffixMatcher.find() ? suffixMatcher.group(1) : "";
}
/**
 * Collects the file tags in a piece of HTML content: every {@code img} and every {@code a} element.
 *
 * <p>For each element the link is made absolute against {@code sourceaddress}, a save path
 * is generated, and the element's markup is recorded in three forms: as found (the map key),
 * with the absolute link, and with the save-path link prefixed by {@code IMG_SERVER/} or
 * {@code FILE_SERVER/}. Elements are skipped when they have no usable link: an {@code img}
 * without any source attribute, or a link rejected by {@code getAbsolutePath}.
 *
 * @param content       HTML content; {@code null} or empty yields an empty map
 * @param sourceaddress address of the page the content came from
 * @return map from the element's original outer HTML to its {@link FileTag}
 */
public static Map<String,FileTag> getContentFileTag(String content,String sourceaddress){
    Map<String,FileTag> imgMap = new HashMap<String,FileTag>();
    if(content==null||content.length()==0){
        return imgMap;
    }
    Document doc = Jsoup.parse(content);
    for (Element tag : doc.select("img,a")) {
        FileTag fileTag = new FileTag();
        // Capture the markup before the element is modified below; it is the map key.
        String rawTag = tag.outerHtml();
        String filePathAttr;
        String preFixPath;
        if(tag.tagName().toLowerCase().equals("img")){
            filePathAttr = "src";
            // Point src at the real image address; an image without any source attribute
            // has nothing to record.
            if(ensureSrc(tag)==null){
                continue;
            }
            preFixPath="IMG_SERVER/";
        }else{
            filePathAttr="href";
            fileTag.setFileName(tag.text());
            preFixPath="FILE_SERVER/";
        }
        // Resolve the absolute address and write it back to the element.
        String absolutePath = getAbsolutePath(tag,sourceaddress,filePathAttr);
        if(absolutePath==null){
            continue;
        }
        tag.attr(filePathAttr,absolutePath);
        fileTag.setAbsolutePath(absolutePath);
        fileTag.setAbsoluteTag(tag.outerHtml());
        // Save path.
        String suffix = ContentFileFinder.getSuffix(absolutePath);
        fileTag.setSavePath(genImgFileName(suffix));
        // Markup pointing at the save path.
        tag.attr(filePathAttr,preFixPath+fileTag.getSavePath());
        fileTag.setSaveTag(tag.outerHtml());
        imgMap.put(rawTag, fileTag);
    }
    return imgMap;
}
/**
 * Variant of the file-tag collection that does not resolve addresses: every {@code img} and
 * {@code a} element is recorded with an EMPTY absolute path, so the "absolute" markup has an
 * empty {@code src}/{@code href} and the generated save path has no suffix.
 *
 * <p>An {@code img} without any source attribute is skipped.
 *
 * @param content HTML content; {@code null} or empty yields an empty map
 * @return map from the element's original outer HTML to its {@link FileTag}
 */
public static Map<String,FileTag> getContentFileTag(String content){
    Map<String,FileTag> imgMap = new HashMap<String,FileTag>();
    if(content==null||content.length()==0){
        return imgMap;
    }
    Document doc = Jsoup.parse(content);
    for (Element tag : doc.select("img,a")) {
        FileTag fileTag = new FileTag();
        // Capture the markup before the element is modified below; it is the map key.
        String rawTag = tag.outerHtml();
        String filePathAttr;
        String preFixPath;
        if(tag.tagName().toLowerCase().equals("img")){
            filePathAttr = "src";
            // An image without any source attribute has nothing to record.
            if(ensureSrc(tag)==null){
                continue;
            }
            preFixPath="IMG_SERVER/";
        }else{
            filePathAttr="href";
            fileTag.setFileName(tag.text());
            preFixPath="FILE_SERVER/";
        }
        // This variant deliberately blanks the link instead of resolving it.
        String absolutePath = "";
        tag.attr(filePathAttr,absolutePath);
        fileTag.setAbsolutePath(absolutePath);
        fileTag.setAbsoluteTag(tag.outerHtml());
        // Save path.
        String suffix = ContentFileFinder.getSuffix(absolutePath);
        fileTag.setSavePath(genImgFileName(suffix));
        // Markup pointing at the save path.
        tag.attr(filePathAttr,preFixPath+fileTag.getSavePath());
        fileTag.setSaveTag(tag.outerHtml());
        imgMap.put(rawTag, fileTag);
    }
    return imgMap;
}
/**
 * Manual check: prints the suffix extracted from a sample URL.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Sample image markup kept for manual experiments; not used by the check below.
    final String sampleImgTag = "<img data-src=\"http://static.tianyaui.com/img/static/2011/imgloading.gif\" title=\"点击图片查看幻灯模式\" original2=\"http://img3.laibafile.cn/p/l/246500759.jpg\" />";
    final String sampleUrl = "http://www.baidu.com//a.xls";
    System.out.println(ContentFileFinder.getSuffix(sampleUrl));
}
}
package com.zzsn.extractor;
import java.io.File;
/**
 * The entity filled in by analysis and extraction: one field per extracted property.
 * Author: Li Dongliang
 * Created: 2016-04-07 15:20:12
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class ExtEntity {

    private String title;
    private String author;
    private String publishDate;
    private String origin;
    private String keywords;
    private String summary;
    private String contentWithTag;
    private String contentNoTag;
    private String contentImgCvtTag;
    private File file;
    private String fileDownLoadPath;

    // ---- title ----
    public String getTitle() { return this.title; }
    public void setTitle(String title) { this.title = title; }

    // ---- author ----
    public String getAuthor() { return this.author; }
    public void setAuthor(String author) { this.author = author; }

    // ---- publish date ----
    public String getPublishDate() { return this.publishDate; }
    public void setPublishDate(String publishDate) { this.publishDate = publishDate; }

    // ---- origin ----
    public String getOrigin() { return this.origin; }
    public void setOrigin(String origin) { this.origin = origin; }

    // ---- keywords ----
    public String getKeywords() { return this.keywords; }
    public void setKeywords(String keywords) { this.keywords = keywords; }

    // ---- summary ----
    public String getSummary() { return this.summary; }
    public void setSummary(String summary) { this.summary = summary; }

    // ---- content with markup ----
    public String getContentWithTag() { return this.contentWithTag; }
    public void setContentWithTag(String contentWithTag) { this.contentWithTag = contentWithTag; }

    // ---- content without markup ----
    public String getContentNoTag() { return this.contentNoTag; }
    public void setContentNoTag(String contentNoTag) { this.contentNoTag = contentNoTag; }

    // ---- content with converted image tags ----
    public String getContentImgCvtTag() { return this.contentImgCvtTag; }
    public void setContentImgCvtTag(String contentImgCvtTag) { this.contentImgCvtTag = contentImgCvtTag; }

    // ---- file ----
    public File getFile() { return this.file; }
    public void setFile(File file) { this.file = file; }

    // ---- file download path ----
    public String getFileDownLoadPath() { return this.fileDownLoadPath; }
    public void setFileDownLoadPath(String fileDownLoadPath) { this.fileDownLoadPath = fileDownLoadPath; }
}
package com.zzsn.extractor;
import java.io.InputStream;
/**
 * Web extraction contract.
 * Author: Li Dongliang
 * Created: 2016-09-13 11:32:28
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public interface Extractor {

    /**
     * Reads the content to extract from an input stream.
     *
     * @param url         address the stream belongs to
     * @param inputStream stream to read from
     * @return whether reading succeeded
     */
    boolean readEntity(String url, InputStream inputStream);

    /**
     * Runs the extraction and stores the results on the given entity.
     *
     * @param entity the entity to fill
     * @throws Exception when extraction fails
     */
    void process(ExtEntity entity) throws Exception;

    /**
     * @return the content that was read
     */
    String getContent();

    /**
     * @return the character set of the content
     */
    String getCharset();
}
package com.zzsn.extractor;
/**
 * Transfer object for an image or file tag.
 * Author: Li Dongliang
 * Created: 2015-07-06 16:51:51
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class FileTag {

    /** The link path; a relative path is converted to an absolute one. */
    private String absolutePath;

    /** The tag markup after its link was converted to an absolute path. */
    private String absoluteTag;

    /** The tag markup used for replacement in the JSP. */
    private String saveTag;

    /** Path the image is saved under. */
    private String savePath;

    /** File name offered for download. */
    private String fileName;

    public String getAbsolutePath() { return this.absolutePath; }
    public void setAbsolutePath(String absolutePath) { this.absolutePath = absolutePath; }

    public String getAbsoluteTag() { return this.absoluteTag; }
    public void setAbsoluteTag(String absoluteTag) { this.absoluteTag = absoluteTag; }

    public String getSaveTag() { return this.saveTag; }
    public void setSaveTag(String saveTag) { this.saveTag = saveTag; }

    public String getSavePath() { return this.savePath; }
    public void setSavePath(String savePath) { this.savePath = savePath; }

    public String getFileName() { return this.fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
}
package com.zzsn.extractor;
/**
 * Manual smoke test: extracts one fixed WeChat article through {@link WeiXinDispatch} and
 * prints the extracted fields to stdout.
 */
public class TestWxDispatch {

    /**
     * @param args unused
     */
    public static void main(String[] args) {
        final String articleUrl = "http://mp.weixin.qq.com/s?__biz=MzAwODE2OTAwNg==&mid=2652292603&idx=2&sn=ef7a35347bc9dceb79c0fa42f8840f79&chksm=809096bab7e71fac9e77e6a1521f64c58e919be1522f4a1dccdad5aa75be3a787346a46e1667&scene=27#wechat_redirect";
        final ExtEntity extracted = new WeiXinDispatch().getExtractorElement(articleUrl);

        // Printed in this order: author, converted-image content, tagged content, title,
        // publish date, plain content.
        final String[] fields = {
            extracted.getAuthor(),
            extracted.getContentImgCvtTag(),
            extracted.getContentWithTag(),
            extracted.getTitle(),
            extracted.getPublishDate(),
            extracted.getContentNoTag()
        };
        for (String field : fields) {
            System.out.println(field);
        }
    }
}
package com.zzsn.extractor;
import com.zzsn.extractor.web.Processor;
import com.zzsn.util.*;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 正文抽取处理类.抽取标题,摘要,正文,作者,字符集
* 创建人:李东亮
* 创建时间:2015-5-11 下午3:28:12
* 公司 :郑州数能软件科技有限公司
* @version 1.0
*
*/
public class WebExtractorImplforweixin implements Extractor{
private static final Logger Log = LoggerFactory.getLogger(WebExtractorImplforweixin.class);
// Extraction types. process() compares the upper-cased processor type name against these constants.
public enum EXT_TYPE {
CONTENT, TITLE, KEYWORDS, SUMMARY, AUTHOR, PUBLISH_DATE, ORIGIN
}
private List<Processor> processors;
private HttpResponse getMethod;
private String html;
private String charset;
/**
 * Creates an extractor.
 *
 * @param processors the processors that process() iterates over, in order
 * @param getMethod  the HTTP response; readEntity() reads its Content-Type header
 */
public WebExtractorImplforweixin(List<Processor> processors,HttpResponse getMethod){
this.processors = processors;
this.getMethod = getMethod;
}
/**
 * Reads the page from the input stream into this extractor.
 *
 * <p>The raw text is read with {@code Constants.READ_CHARSET}, the character set is
 * determined from the text and the response's {@code Content-Type} header, the text is
 * converted to that character set and finally normalized through jsoup. The results are
 * available through {@link #getContent()} and {@link #getCharset()}.
 *
 * @param url         address of the page; used only for logging
 * @param inputStream stream holding the page
 * @return {@code true} when the page was read; {@code false} when reading failed (the cause is logged)
 */
public boolean readEntity(String url,InputStream inputStream) {
    try{
        html = FileUtil.readHtml(inputStream, Constants.READ_CHARSET);
        Header header = getMethod.getFirstHeader("Content-Type");
        charset = CharsetUtil.getCharset(html,header);
        html = CharsetUtil.convertCorrectCharset(html,charset);
        Document jsoupDoc = Jsoup.parse(html);
        html = jsoupDoc.html();
    }catch(Exception e){
        // Keep the best-effort contract (return false) but do not lose the cause.
        Log.error("Failed to read page content from " + url, e);
        return false;
    }
    return html != null;
}
/**
 * Produces the body with markup by passing it through
 * {@code ContentUtility.RemoveUselessHTMLTagX}.
 *
 * @param body the extracted body markup
 * @return the value returned by {@code RemoveUselessHTMLTagX}
 */
private String getContentWithTag(String body) {
    return ContentUtility.RemoveUselessHTMLTagX(body);
}
/**
 * Gets the body without markup by passing it through {@code ContentUtility.TransferHTML2Text}.
 * Author: Li Dongliang
 * Created: 2015-05-28 15:06:32
 * @version 1.0
 * @param contentWithTag the body including markup
 * @return the value returned by {@code TransferHTML2Text}
 */
private String getContentNoTag(String contentWithTag) {
return ContentUtility.TransferHTML2Text(contentWithTag);
}
/**
 * Removes invalid characters from HTML tags.
 *
 * <p>Every span that matches the tag pattern below is cleaned by deleting all characters
 * other than {@code < > ( ) |}, {@code /}, digits and word characters, and every occurrence
 * of the matched span in the text is replaced by its cleaned form. The replacement is
 * literal, so matched spans containing regex metacharacters (such as {@code [}, {@code *}
 * or {@code ?}) are handled correctly.
 *
 * <p>NOTE(review): the pattern also matches spans such as {@code <p>...</p>} whose inner
 * text consists only of non-word characters, and that text is then deleted; confirm this
 * is intended before using the method on non-ASCII content.
 *
 * @param html the HTML text
 * @return the text with the matched spans cleaned
 */
public String formatHtmlTag(String html) {
    Pattern tagPattern = Pattern.compile("<[\\d|\\w|\\/]*([^(\\d|\\w|\\/)]+)[\\d|\\w|\\/]*>");
    // The matcher keeps scanning the original text while html is rebuilt below.
    Matcher m = tagPattern.matcher(html);
    while (m.find()) {
        String dirty = m.group();
        String cleaned = dirty.replaceAll("[^(\\<|\\>|\\d|\\w|\\/)]+", "");
        // Literal replacement: the matched text must not be interpreted as a regex.
        html = html.replace(dirty, cleaned);
    }
    return html;
}
/**
 * Runs every configured processor against the page read by {@code readEntity} and stores
 * the results on the entity.
 *
 * <p>Each processor declares an extraction type. A property is only filled while it is
 * still {@code null} on the entity, so the first processor that yields a value wins.
 * Title, author, origin, keywords, summary and publish date are stored as plain text;
 * the content is stored with markup. When the publish-date processor matches but yields
 * empty text, the date is taken from the page's {@code var ct = "<10-digit seconds>"}
 * script variable if present.
 *
 * @param entity the entity to fill
 * @throws Exception when a processor fails
 */
public void process(ExtEntity entity) throws Exception {
    for (Processor processor : processors) {
        String ename = processor.getExtType().getEname().toUpperCase();
        // Title
        if (ename.equals(EXT_TYPE.TITLE.toString())&&entity.getTitle()==null) {
            String text = extractPlainText(processor);
            if (text != null) {
                entity.setTitle(text);
            }
        }
        // Author
        else if (ename.equals(EXT_TYPE.AUTHOR.toString())&&entity.getAuthor()==null) {
            String text = extractPlainText(processor);
            if (text != null) {
                entity.setAuthor(text);
            }
        }
        // Publish date
        else if (ename.equals(EXT_TYPE.PUBLISH_DATE.toString())&&entity.getPublishDate()==null) {
            String result = processor.extract(html);
            if (result != null) {
                String text = this.getContentNoTag(result);
                if (text.length() > 0) {
                    entity.setPublishDate(text);
                } else {
                    String fallback = publishDateFromCtVariable();
                    if (fallback != null) {
                        entity.setPublishDate(fallback);
                    }
                }
            }
        }
        // Origin
        else if (ename.equals(EXT_TYPE.ORIGIN.toString())&&entity.getOrigin()==null) {
            String text = extractPlainText(processor);
            if (text != null) {
                entity.setOrigin(text);
            }
        }
        // Keywords
        else if (ename.equals(EXT_TYPE.KEYWORDS.toString())&&entity.getKeywords()==null) {
            String text = extractPlainText(processor);
            if (text != null) {
                entity.setKeywords(text);
            }
        }
        // Summary
        else if (ename.equals(EXT_TYPE.SUMMARY.toString())&&entity.getSummary()==null) {
            String text = extractPlainText(processor);
            if (text != null) {
                entity.setSummary(text);
            }
        }
        // Content with markup
        else if (ename.equals(EXT_TYPE.CONTENT.toString())&&entity.getContentWithTag()==null) {
            String result = processor.extract(html);
            if (result != null) {
                entity.setContentWithTag(this.getContentWithTag(result));
            }
        }
    }
}

/** Runs one processor and returns its result as plain text, or {@code null} when it yields nothing. */
private String extractPlainText(Processor processor) throws Exception {
    String result = processor.extract(html);
    if (result == null) {
        return null;
    }
    String text = this.getContentNoTag(result);
    return text.length() > 0 ? text : null;
}

/** Reads the 10-digit {@code ct} timestamp from the page script and formats it, or returns {@code null}. */
private String publishDateFromCtVariable() throws Exception {
    final String marker = "ar ct = \"";
    int markerAt = html.indexOf(marker);
    if (markerAt < 0) {
        return null;
    }
    int from = markerAt + marker.length();
    int to = from + 10;
    // Guard against a truncated page: the value must be fully present and numeric.
    if (to > html.length()) {
        return null;
    }
    String seconds = html.substring(from, to);
    if (!seconds.matches("\\d{10}")) {
        return null;
    }
    // The page stores seconds; DateUtil is given the value with "000" appended.
    return DateUtil.tiemString2String(seconds + "000", true);
}
/** Returns the page HTML held by this extractor (set by readEntity). */
public String getContent(){
return html;
}
/** Returns the character set determined by readEntity. */
public String getCharset(){
return charset;
}
}
package com.zzsn.extractor;
import com.zzsn.extractor.web.Processor;
import com.zzsn.extractor.web.ProcessorReader;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.FileUtil;
import org.apache.http.*;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.nodes.Element;
import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.nio.charset.UnsupportedCharsetException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WeiXinDispatch {
/**
 * Finds every match of a regular expression in a piece of HTML.
 *
 * @param siteTimeMatch the regular expression
 * @param scopeHtml     the text to search
 * @return all matched substrings in order of appearance; empty when nothing matches
 */
public List<String> testMatchTime(String siteTimeMatch, String scopeHtml)
{
    final List<String> matches = new ArrayList<String>();
    final Matcher scanner = Pattern.compile(siteTimeMatch).matcher(scopeHtml);
    for (boolean found = scanner.find(); found; found = scanner.find()) {
        matches.add(scanner.group());
    }
    return matches;
}
/**
 * Navigates from {@code element} along a path and returns the first regex match
 * found in the HTML of the element reached.
 *
 * <p>{@code matchExp} has the form {@code path@regex}. {@code path} is a list of
 * steps joined by {@code _}; each step is {@code parent}, {@code prev} or
 * {@code next} (any other step is ignored).
 *
 * @param matchExp navigation path and regular expression separated by {@code @}
 * @param element  starting element
 * @return the first match, or {@code null} when the expression is blank or
 *         malformed, nothing matches, or any exception occurs along the way
 */
private String matchtime(String matchExp, Element element) {
    if (matchExp == null) {
        return null;
    }
    matchExp = matchExp.trim();
    if (matchExp.length() == 0) {
        return null;
    }
    try {
        final String[] parts = matchExp.split("@");
        if (parts.length != 2) {
            return null;
        }
        Element cursor = element;
        for (String move : parts[0].split("_")) {
            if (cursor == null) {
                break;
            }
            if (move.equals("parent")) {
                cursor = cursor.parent();
            } else if (move.equals("prev")) {
                cursor = cursor.previousElementSibling();
            } else if (move.equals("next")) {
                cursor = cursor.nextElementSibling();
            }
        }
        // When navigation ran off the tree, cursor is null here and the resulting
        // NullPointerException is turned into a null result by the catch below.
        final String scope = cursor.html();
        final Matcher finder = Pattern.compile(parts[1]).matcher(scope);
        return finder.find() ? finder.group() : null;
    } catch (Exception e) {
        return null;
    }
}
/**
 * Joins a relative path onto a URI after stripping trailing segments from the URI.
 *
 * <p>Each leading {@code ../} removes one segment, plus one extra segment for the
 * whole run; each leading {@code ./} removes one segment. Stripping stops once the
 * last {@code /} sits at index 8 or lower. A path with neither prefix is appended
 * to the URI unchanged.
 *
 * @param currentURI   URI the relative path is resolved against
 * @param relativePath path that may start with {@code ../} or {@code ./}
 * @return the shortened URI, a {@code /}, and the path without its dot prefixes
 */
private String convertRelativePath(String currentURI, String relativePath) {
    int levels = 0;
    if (relativePath.matches("^\\.\\./.*")) {
        do {
            levels++;
            relativePath = relativePath.replaceFirst("^\\.\\./", "");
        } while (relativePath.matches("^\\.\\./.*"));
        // One more level for the run of "../" prefixes as a whole.
        levels++;
    } else {
        while (relativePath.matches("^\\./.*")) {
            levels++;
            relativePath = relativePath.replaceFirst("^\\./", "");
        }
    }
    int slash = currentURI.lastIndexOf("/");
    while (slash > 8 && levels > 0) {
        currentURI = currentURI.substring(0, slash);
        slash = currentURI.lastIndexOf("/");
        levels--;
    }
    return currentURI + "/" + relativePath;
}
/**
 * Extracts the {@code href} of the first {@code <base>} tag found in the first
 * 3000 characters of a page.
 *
 * <p>Fixes over the previous version: the 3000-character cap is now actually
 * applied (the old guard tested an always-empty string), the default port
 * {@code :80} is removed only when it is not followed by another digit (so
 * {@code :8080} is no longer mangled), and a {@code null} page yields
 * {@code null} instead of a NullPointerException.
 *
 * @param content page HTML, may be {@code null}
 * @return the base href with any default {@code :80} port removed, or
 *         {@code null} when no {@code <base>} tag is found
 */
protected String getBaseHref(String content)
{
    if (content == null) {
        return null;
    }
    // Only the head of the document is scanned.
    final String contentPrefix = content.length() > 3000 ? content.substring(0, 3000) : content;
    final Matcher matcher = Pattern
            .compile("(?is)<base\\s+[^>]*href\\s*=\\s*['\"]*([^\"|'|\\s]+)['\"]*[^>]*>")
            .matcher(contentPrefix);
    if (!matcher.find()) {
        return null;
    }
    final String href = matcher.group(1);
    if (href == null) {
        return null;
    }
    // ":80" followed by a digit is part of a longer port number and must be kept.
    return href.replaceAll(":80(?!\\d)", "");
}
/**
 * Derives a base location from a URL by dropping its last path segment.
 *
 * @param url URL to shorten
 * @return {@code url} unchanged when it ends with {@code /} or when its last
 *         {@code /} is at index 8 or lower; otherwise everything before the last {@code /}
 */
private String getBaseHrefByUrl(String url) {
    if (!url.endsWith("/")) {
        final int cut = url.lastIndexOf("/");
        // cut > 8 also rules out the "not found" value -1.
        if (cut > 8) {
            url = url.substring(0, cut);
        }
    }
    return url;
}
/**
 * Builds an HTTP client whose TLSv1.2 context accepts every server certificate.
 *
 * <p>NOTE(review): the trust strategy returns {@code true} unconditionally, so
 * certificate chains are not validated at all.
 *
 * @return the trust-all client, or a default client when the SSL context cannot
 *         be built (the failure is printed to standard error)
 */
public static CloseableHttpClient createSSLClientDefault() {
    // Trust strategy that accepts any certificate chain.
    final TrustStrategy acceptEverything = new TrustStrategy() {
        @Override
        public boolean isTrusted(java.security.cert.X509Certificate[] chain, String authType)
                throws java.security.cert.CertificateException {
            return true;
        }
    };
    try {
        final SSLContext tlsContext = new SSLContextBuilder()
                .useProtocol("TLSv1.2")
                .loadTrustMaterial(null, acceptEverything)
                .build();
        return HttpClients.custom()
                .setSSLSocketFactory(new SSLConnectionSocketFactory(tlsContext))
                .build();
    } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
        e.printStackTrace();
    }
    return HttpClients.createDefault();
}
/**
 * Downloads a page and returns the part of it selected by a scope expression.
 *
 * <p>Fixes over the previous version: a failed request (null response) now
 * yields an empty string instead of a NullPointerException, and the response is
 * closed once its body has been read.
 *
 * @param siteURI   address of the page to download
 * @param scopetags scope expression; when {@code null} or empty the whole page is used
 * @return the selected HTML after {@code ContentUtility.convertHtmlToUtf8}; the
 *         whole page when extraction throws; an empty string when the request failed
 */
public String testScopeTag(String siteURI, String scopetags)
{
    Processor p = ProcessorReader.getScopeProcessor(scopetags);
    HttpResponse re = getmethodNokey(siteURI);
    if (re == null) {
        // getmethodNokey has already printed the cause of the failure.
        return "";
    }
    try {
        String siteHtml = "";
        try {
            siteHtml = FileUtil.readHtml(re.getEntity().getContent(), Constants.READ_CHARSET);
        } catch (Exception e) {
            e.printStackTrace();
        }
        String result;
        if (p == null) {
            result = siteHtml;
        } else {
            try {
                result = p.extract(siteHtml);
            } catch (Exception e) {
                // Fall back to the unfiltered page when the scope expression fails.
                result = siteHtml;
            }
        }
        return ContentUtility.convertHtmlToUtf8(result, re.getFirstHeader("Content-Type"));
    } finally {
        // Release the connection held by the response.
        if (re instanceof CloseableHttpResponse) {
            try {
                ((CloseableHttpResponse) re).close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Issues a GET request through a client from {@link #createSSLClientDefault()}.
 *
 * <p>Timeouts: connect 500000 ms, connection-request 100000 ms, socket 500000 ms.
 * The request carries a form-urlencoded {@code Content-Type} header and
 * {@code Connection: close}.
 *
 * @param url address to fetch
 * @return the response, or {@code null} when the request fails with an
 *         {@link IOException} (the stack trace is printed)
 */
public HttpResponse getmethodNokey(String url) {
    final CloseableHttpClient browser = createSSLClientDefault();
    final HttpGet request = new HttpGet(url);
    request.setConfig(RequestConfig.custom()
            .setConnectTimeout(500000)
            .setConnectionRequestTimeout(100000)
            .setSocketTimeout(500000)
            .build());
    // Headers meant to make the request look like it comes from a browser.
    request.setHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
    request.setHeader(HttpHeaders.CONNECTION, "close");
    CloseableHttpResponse answer = null;
    try {
        answer = browser.execute(request);
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this covers both cases.
        e.printStackTrace();
    }
    return answer;
}
/**
 * Downloads a page and runs the WeChat processors over it.
 *
 * <p>Fixes over the previous version: a failed request or a missing content
 * type now yields {@code null} instead of a NullPointerException, and the
 * response is closed when processing ends.
 *
 * @param url address of the page to extract
 * @return the populated entity (possibly partly empty when reading or
 *         processing throws); {@code null} when the request failed or the
 *         content type is not {@code "HTML"}
 */
public ExtEntity getExtractorElement(String url)
{
    HttpResponse getMethod = getmethodNokey(url);
    if (getMethod == null) {
        // getmethodNokey has already printed the cause of the failure.
        return null;
    }
    try {
        List<Processor> processors = ProcessorReader.readWeChatProcessors();
        String contentType = ContentUtility.getContentType(getMethod);
        if (!"HTML".equals(contentType)) {
            return null;
        }
        Extractor extractor = new WebExtractorImplforweixin(processors, getMethod);
        ExtEntity entity = new ExtEntity();
        try {
            extractor.readEntity(url, getMethod.getEntity().getContent());
            extractor.process(entity);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return entity;
    } finally {
        // Release the connection held by the response.
        if (getMethod instanceof CloseableHttpResponse) {
            try {
                ((CloseableHttpResponse) getMethod).close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Issues a GET request, prints the URI of the request recorded in the execution
 * context and the response body, then returns the response.
 *
 * <p>The body is consumed here, so the entity of the returned response can no
 * longer be read by the caller.
 *
 * <p>Fixes over the previous version: a failed request now returns {@code null}
 * instead of throwing a NullPointerException, and the context attribute is
 * type-checked before it is used.
 *
 * @param url address to fetch
 * @return the response with its entity already consumed, or {@code null} when
 *         the request failed
 */
public HttpResponse gettarget(String url){
    CloseableHttpClient httpClient = createSSLClientDefault();
    HttpContext httpContext = new BasicHttpContext();
    HttpGet httpGet = new HttpGet(url);
    RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(500000).setConnectionRequestTimeout(100000).setSocketTimeout(500000).build();
    httpGet.setConfig(requestConfig);
    // Headers meant to make the request look like it comes from a browser.
    httpGet.setHeader("Content-Type",
            "application/x-www-form-urlencoded;charset=utf-8");
    httpGet.setHeader(HttpHeaders.CONNECTION, "close");
    CloseableHttpResponse response = null;
    try {
        response = httpClient.execute(httpGet, httpContext);
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this covers both cases.
        e.printStackTrace();
    }
    // The context records the request that was actually executed.
    Object executed = httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);
    if (executed instanceof HttpUriRequest) {
        System.out.println("URI信息:" + ((HttpUriRequest) executed).getURI());
    }
    if (response == null) {
        return null;
    }
    HttpEntity entity = response.getEntity();
    if (null != entity) {
        try {
            System.out.println("响应内容:" + EntityUtils.toString(entity, ContentType.getOrDefault(entity).getCharset()));
        } catch (UnsupportedCharsetException | ParseException | IOException e) {
            e.printStackTrace();
        }
        try {
            EntityUtils.consume(entity);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return response;
}
/**
 * Debug helper: requests a fixed toutiao.com URL and prints the target host,
 * the URI of the request recorded in the execution context, and the response body.
 * Any exception is printed; the client's connection manager is always shut down.
 */
public static void getRedirectInfo() {
    final HttpClient legacyClient = new DefaultHttpClient();
    final HttpContext exchangeState = new BasicHttpContext();
    final HttpGet probe = new HttpGet("http://m.toutiao.com/group/6793100121849463308/");
    try {
        // Passing a context to execute() makes HttpClient store the state of the
        // request/response exchange in it.
        final HttpResponse reply = legacyClient.execute(probe, exchangeState);
        // Host that was finally contacted, read back from the context.
        final HttpHost finalHost = (HttpHost) exchangeState.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
        // Request object that was finally executed, read back from the context.
        final HttpUriRequest finalRequest = (HttpUriRequest) exchangeState.getAttribute(ExecutionContext.HTTP_REQUEST);
        System.out.println("主机地址:" + finalHost);
        System.out.println("URI信息:" + finalRequest.getURI());
        final HttpEntity body = reply.getEntity();
        if (body != null) {
            System.out.println("响应内容:" + EntityUtils.toString(body, ContentType.getOrDefault(body).getCharset()));
            EntityUtils.consume(body);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        legacyClient.getConnectionManager().shutdown();
    }
}
/**
 * Manual smoke test: downloads a fixed toutiao.com article with
 * {@code getmethodNokey} and prints its HTML. Any failure is printed as a
 * stack trace.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final HttpResponse page = new WeiXinDispatch().getmethodNokey("http://www.toutiao.com/a6793100121849463308/");
    try {
        final String content = FileUtil.readHtml(page.getEntity().getContent(), Constants.READ_CHARSET);
        System.out.println(content);
    } catch (Exception e) {
        // Covers IllegalStateException and IOException as well; all were handled identically.
        e.printStackTrace();
    }
}
}
package com.zzsn.extractor.web;
import java.util.ArrayList;
import java.util.List;
/**
 * Default result object: a numeric status plus a list of error messages.
 *
 * Author: Li Dongliang
 * Created: 2015-05-13 10:23:02
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class DefaultMsg {

    /** Status code; a new instance starts at 1 (presumably "success" - confirm with callers). */
    private int success = 1;

    /** Error messages; a new instance starts with an empty list. */
    private List<String> errors = new ArrayList<String>();

    /** @return the error message list currently held */
    public List<String> getErrors() {
        return this.errors;
    }

    /** @param errors list that replaces the current error message list */
    public void setErrors(List<String> errors) {
        this.errors = errors;
    }

    /** @return the current status code */
    public int getSuccess() {
        return this.success;
    }

    /** @param success new status code */
    public void setSuccess(int success) {
        this.success = success;
    }
}
package com.zzsn.extractor.web;
import java.io.Serializable;
/**
 * Describes one extraction rule: which field it fills and how to locate it.
 *
 * Author: Li Dongliang
 * Created: 2015-05-18 16:39:22
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public class ExtType implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Name of the extraction type. */
    private String ename;

    /** Expression used to locate the content. */
    private String exp;

    /** Tags that should be removed from the located content. */
    private String subtraction;

    /** When set, the value of this attribute on the located tag is the result. */
    private String attr;

    /** @return the extraction type name */
    public String getEname() {
        return this.ename;
    }

    /** @param ename extraction type name */
    public void setEname(String ename) {
        this.ename = ename;
    }

    /** @return the locating expression */
    public String getExp() {
        return this.exp;
    }

    /** @param exp locating expression */
    public void setExp(String exp) {
        this.exp = exp;
    }

    /** @return the tags to remove */
    public String getSubtraction() {
        return this.subtraction;
    }

    /** @param subtraction tags to remove */
    public void setSubtraction(String subtraction) {
        this.subtraction = subtraction;
    }

    /** @return the attribute name whose value is returned, or {@code null} */
    public String getAttr() {
        return this.attr;
    }

    /** @param attr attribute name whose value is returned */
    public void setAttr(String attr) {
        this.attr = attr;
    }
}
package com.zzsn.extractor.web;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FileNotFoundException;
import java.io.Serializable;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Tag-based extraction implementation (built on Jsoup).
* Author: Li Dongliang
* Created: 2015-05-17 15:07:01
* Company: Zhengzhou Shuneng Software Technology Co., Ltd.
* @version 1.0
*
*/
public class JsoupTagProcessor implements Processor, Serializable {
// SLF4J logger for this class.
private static final Logger Log = LoggerFactory.getLogger(JsoupTagProcessor.class);
/**
* Serialization version identifier.
*/
private static final long serialVersionUID = 1L;
// Alternative locator expressions; init() fills this by splitting the expression on '|'.
private String[] nestTagsArray;
// Steps of the alternative being evaluated; extract() assigns it by splitting one alternative on '.'.
private String[] nestTags;
// Tag specs that follow each ".*." wildcard, appended by configEndTag().
private List<String> endTagNames = new ArrayList<String>();
// Attribute constraints parsed from the entry at the same index of endTagNames.
private List<Map<String,String>> endTagAttrs = new ArrayList<Map<String,String>>();
// Specs of tags to strip from the result; init() splits the subtraction string on ','.
private String[] removeTags;
// Rule this processor was created for.
private ExtType extType;
// Attribute name copied from the rule by init(); when non-null, extract() returns that attribute's value.
private String attr;
// Compiled regex for "REG[...]" expressions; stays null when the expression is not a usable regex.
private Pattern regPattern = null;
// Set by init() when the expression starts with "REG" (case-insensitive).
private boolean isReg = false;
/**
* Creates a processor for the given rule. init() must be called before extract()
* for the rule to take effect, since the constructor only stores the rule.
*
* @param extType extraction rule
*/
public JsoupTagProcessor(ExtType extType){
this.extType = extType;
}
/**
 * Normalises an attribute value: removes straight and curly quote characters,
 * trims the ends, and collapses each run of whitespace (or {@code |}) into a
 * single space.
 *
 * @param attr raw attribute value
 * @return the normalised value
 */
private String formatAttr(String attr) {
    final String unquoted = attr.replaceAll("'|\"|‘|“|”", "");
    return unquoted.trim().replaceAll("[\\s| ]+", " ");
}
/**
 * Registers the tag that follows each {@code .*.} wildcard of an expression.
 *
 * <p>For every segment after the first, the text up to the first {@code .} is
 * appended to {@code endTagNames} and its bracketed attributes to
 * {@code endTagAttrs}. Expressions without a wildcard register nothing.
 *
 * @param exp one locator expression
 */
private void configEndTag(String exp) {
    final String[] segments = exp.split("\\.\\*\\.");
    // Segment 0 precedes the first wildcard, so the loop starts at 1 and does
    // nothing when the expression contains no wildcard.
    for (int idx = 1; idx < segments.length; idx++) {
        final String head = segments[idx].split("\\.")[0];
        endTagNames.add(head);
        endTagAttrs.add(getAttrs(head));
    }
}
/**
 * Compiles the regular expression wrapped in a {@code REG[...]} expression.
 *
 * @param regExp expression of the form {@code REG[regex]} (prefix is case-insensitive)
 * @return the compiled pattern, or {@code null} when the wrapper is absent, the
 *         inner regex contains no parenthesised group, or it fails to compile
 *         (the last case is logged as an error)
 */
private Pattern getRegPattern(String regExp) {
    final Matcher wrapper = Pattern.compile("^(?i)REG\\[(.+)\\]$").matcher(regExp);
    if (!wrapper.find()) {
        return null;
    }
    final String inner = wrapper.group(1);
    // A regex without parentheses has no group for extract() to return.
    if (!inner.matches(".*\\(.+\\).*")) {
        return null;
    }
    try {
        return Pattern.compile(inner);
    } catch (Exception e) {
        Log.error(regExp+"中括号中的内容不是有效的正则表达式");
        return null;
    }
}
/**
 * Parses the rule into the fields that {@code extract} works from.
 *
 * <p>An expression starting with {@code REG} switches the processor to regex
 * mode. Otherwise the expression is split on {@code |} into alternatives, each
 * alternative starting with {@code *} is prefixed with {@code HTML[1].}, and the
 * wildcard end tags, tags to remove and result attribute are recorded.
 *
 * @return a fresh {@link DefaultMsg}
 */
public DefaultMsg init() {
    final DefaultMsg outcome = new DefaultMsg();
    final String expression = extType.getExp().trim();
    if (expression.toUpperCase().startsWith("REG")) {
        isReg = true;
        regPattern = getRegPattern(expression);
        return outcome;
    }
    nestTagsArray = expression.split("\\|");
    for (int k = 0; k < nestTagsArray.length; k++) {
        if (nestTagsArray[k].startsWith("*")) {
            nestTagsArray[k] = "HTML[1]." + nestTagsArray[k];
        }
        configEndTag(nestTagsArray[k]);
    }
    final String subtraction = extType.getSubtraction();
    if (subtraction != null && subtraction.length() > 0) {
        removeTags = subtraction.split("\\,");
    }
    attr = extType.getAttr();
    return outcome;
}
/**
 * @return the rule this processor was constructed with
 */
public ExtType getExtType() {
    return extType;
}
/**
 * Parses the HTML, strips the configured tags from the whole document and
 * returns what is left.
 *
 * <p>NOTE(review): {@code charset} is handed to {@code Jsoup.parse(String, String)},
 * whose second argument is documented as a base URI - confirm the intent.
 *
 * @param html    page HTML
 * @param charset passed through to {@code Jsoup.parse} as its second argument
 * @return the outer HTML of the document after removal
 */
public String test(String html, String charset) {
    final Document parsed = Jsoup.parse(html, charset);
    if (removeTags == null) {
        return parsed.outerHtml();
    }
    for (String spec : removeTags) {
        final List<Node> doomed = this.getRemoveTags(parsed, spec);
        if (doomed == null) {
            continue;
        }
        for (Node victim : doomed) {
            victim.remove();
        }
    }
    return parsed.outerHtml();
}
/**
 * Applies the rule to a page.
 *
 * <p>In regex mode the result is group 1 of the first match against the parsed
 * document's outer HTML. Otherwise the alternatives are tried in order and the
 * first one that resolves to a node wins; the result is the configured attribute
 * of that node, or its outer HTML after the configured tags were removed.
 *
 * @param html page HTML
 * @return the extracted text, or an empty string when nothing is located
 */
public String extract(String html) {
    final Document doc = Jsoup.parse(html);

    // Regex mode.
    if (isReg) {
        if (regPattern == null) {
            return "";
        }
        final Matcher finder = regPattern.matcher(doc.outerHtml());
        if (!finder.find()) {
            return "";
        }
        try {
            return finder.group(1);
        } catch (Exception e) {
            return "";
        }
    }

    // Tag mode: locate the node.
    if (nestTagsArray == null || nestTagsArray.length == 0) {
        return "";
    }
    Node located = null;
    for (String alternative : nestTagsArray) {
        nestTags = alternative.split("\\.");
        located = resuChild(nestTags, doc, 0, -1);
        if (located != null) {
            break;
        }
    }
    if (located == null) {
        return "";
    }

    // Post-processing: an attribute value takes precedence over the node HTML.
    if (this.attr != null) {
        return located.attr(attr);
    }
    if (removeTags != null) {
        for (String spec : removeTags) {
            final List<Node> doomed = this.getRemoveTags(located, spec);
            if (doomed == null) {
                continue;
            }
            for (Node victim : doomed) {
                victim.remove();
            }
        }
    }
    return located.outerHtml();
}
/**
 * Finds the nodes under {@code node} that a removal spec refers to.
 *
 * @param node      root of the search
 * @param removeTag tag spec, optionally with bracketed attributes or a position
 * @return the nodes located by {@code findNodes} for that spec
 */
private List<Node> getRemoveTags(Node node, String removeTag) {
    final Map<String, String> constraints = this.getAttrs(removeTag);
    return this.findNodes(node, removeTag, constraints);
}
/**
 * Parses the {@code [name=value]} groups of a tag spec.
 *
 * <p>Groups that do not split into exactly two parts on {@code =} (for example a
 * bare position such as {@code [2]}) are skipped, as are pairs whose name or
 * normalised value is empty.
 *
 * @param tagsItem tag spec such as {@code div[id="x"]}
 * @return attribute name to normalised value; empty when there are no such groups
 */
public Map<String, String> getAttrs(String tagsItem) {
    final Map<String, String> found = new HashMap<String, String>();
    final Matcher brackets = Pattern.compile("\\[.*?\\]").matcher(tagsItem);
    while (brackets.find()) {
        final String[] pair = brackets.group().replace("[", "").replace("]", "").split("=");
        if (pair.length != 2) {
            continue;
        }
        final String name = pair[0].trim();
        final String expected = this.formatAttr(pair[1]);
        if (name.length() > 0 && expected.length() > 0) {
            found.put(name, expected);
        }
    }
    return found;
}
/**
 * Reads the first purely numeric bracket group of a tag spec.
 *
 * @param tagsItem tag spec such as {@code div[2]}
 * @return the number inside the first {@code [digits]} group, or {@code null}
 *         when the spec has none
 */
public Integer getPos(String tagsItem) {
    final Matcher ordinal = Pattern.compile("\\[\\d+\\]").matcher(tagsItem);
    if (!ordinal.find()) {
        return null;
    }
    final String token = ordinal.group();
    // The match is exactly "[digits]", so dropping both ends leaves the digits.
    return Integer.valueOf(token.substring(1, token.length() - 1));
}
/**
 * Checks whether a node satisfies every attribute constraint; an empty
 * constraint map always matches.
 *
 * <p>The node's attribute value is normalised with {@code formatAttr} before the
 * comparison. The expected values were already normalised by {@code getAttrs},
 * and {@code selectElement} normalises the document side too; without this the
 * single-level search rejected values that differ only by quotes or surplus
 * whitespace while the wildcard search accepted them.
 *
 * @param tagNode node to test
 * @param attrs   attribute name to expected (normalised) value
 * @return {@code true} when every constraint is met
 */
private boolean match(Node tagNode, Map<String, String> attrs) {
    for (Map.Entry<String, String> wanted : attrs.entrySet()) {
        final String actual = tagNode.attr(wanted.getKey());
        if (actual == null || !wanted.getValue().equals(this.formatAttr(actual))) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the upper-cased tag name of a tag spec, i.e. the text before the
 * first {@code [} (or the whole spec when it has no bracket).
 *
 * @param tagsItem tag spec such as {@code div[id="x"]}
 * @return the tag name in upper case
 */
private String getTagName(String tagsItem) {
    final int bracket = tagsItem.indexOf("[");
    final String bare = (bracket == -1) ? tagsItem : tagsItem.substring(0, bracket);
    return bare.toUpperCase();
}
/**
 * Selects the direct child elements of a node that have the given tag name and
 * satisfy every attribute constraint (attribute values are normalised with
 * {@code formatAttr} before comparison).
 *
 * @param node    parent to inspect; anything that is not an {@code Element} yields no result
 * @param tagName tag name, compared case-insensitively via upper-casing
 * @param attrs   attribute name to expected (normalised) value
 * @return the matching children, possibly empty, never {@code null}
 */
private Elements selectElement(Node node, String tagName, Map<String, String> attrs) {
    final Elements matched = new Elements();
    if (!(node instanceof Element)) {
        return matched;
    }
    final Elements candidates = ((Element) node).children();
    final Set<Map.Entry<String, String>> constraints = attrs.entrySet();
    for (Element child : candidates) {
        if (!child.tagName().toUpperCase().equals(tagName.toUpperCase())) {
            continue;
        }
        boolean accepted = true;
        for (Map.Entry<String, String> constraint : constraints) {
            final String name = constraint.getKey();
            if (!child.hasAttr(name)
                    || !this.formatAttr(child.attr(name)).equals(constraint.getValue())) {
                accepted = false;
                break;
            }
        }
        if (accepted) {
            matched.add(child);
        }
    }
    return matched;
}
/**
 * Depth-first search for the nearest level that contains matching elements.
 *
 * <p>If {@code parent} has matching direct children they are returned; otherwise
 * each child node is searched in document order and the first non-empty result
 * is returned.
 *
 * @param parent  node to search under
 * @param tagName tag name to look for
 * @param attrs   attribute constraints
 * @return the matching siblings found first, or {@code null} when there are none
 */
private Elements getDirectNodes(Node parent, String tagName, Map<String, String> attrs) {
    final Elements here = selectElement(parent, tagName, attrs);
    final List<Node> offspring = parent.childNodes();
    if (here == null || here.size() > 0) {
        return here;
    }
    for (Node kid : offspring) {
        // Recurse into each child until one subtree yields a match.
        final Elements deeper = getDirectNodes(kid, tagName, attrs);
        if (deeper != null && deeper.size() > 0) {
            return deeper;
        }
    }
    return null;
}
/**
 * Finds the nodes under {@code parent} that a tag spec refers to.
 *
 * <p>When the spec carries a numeric position and at least that many elements
 * were found, only the element at that (1-based) position is returned;
 * otherwise all elements found are returned.
 *
 * @param parent root of the search
 * @param tag    tag spec; its name and optional position are read from it
 * @param attrs  attribute constraints
 * @return the selected nodes; empty when nothing is found
 */
private List<Node> findNodes(Node parent, String tag, Map<String, String> attrs) {
    final Elements hits = getDirectNodes(parent, this.getTagName(tag), attrs);
    if (hits == null) {
        return new ArrayList<Node>();
    }
    final Integer ordinal = this.getPos(tag);
    if (ordinal != null && hits.size() >= ordinal) {
        final List<Node> only = new ArrayList<Node>();
        only.add(hits.get(Math.max(ordinal - 1, 0)));
        return only;
    }
    return new ArrayList<Node>(hits);
}
/**
 * Resolves a {@code *} step: searches any depth below {@code parent} for one of
 * the registered wildcard end tags.
 *
 * <p>End tags are tried starting at index {@code endTagIndex + 1}. For the first
 * one that yields nodes, the first node is taken; the search then either stops
 * there (no steps left after skipping the wildcard and its end tag) or continues
 * with {@code resuChild} from that node.
 *
 * @param nestTags     steps of the expression being evaluated
 * @param parent       node to search under
 * @param endTagIndex  index of the last end tag already consumed
 * @param nestTagindex index of the {@code *} step in {@code nestTags}
 * @return the located node, or {@code null}
 */
private Node resuMultiple(String[] nestTags, Node parent, int endTagIndex, int nestTagindex) {
    final int firstCandidate = endTagIndex + 1;
    if (endTagNames == null || endTagNames.size() == 0) {
        return null;
    }
    for (int i = firstCandidate; i < endTagNames.size(); i++) {
        final List<Node> hits = this.findNodes(parent, endTagNames.get(i), endTagAttrs.get(i));
        if (hits.size() == 0) {
            continue;
        }
        final Node first = hits.get(0);
        // Skip both the "*" step and the end-tag step that was just resolved.
        final int resume = nestTagindex + 2;
        if (resume >= nestTags.length) {
            return first;
        }
        return resuChild(nestTags, first, resume, firstCandidate);
    }
    return null;
}
/**
 * Resolves an ordinary step against the direct children of {@code parent}.
 *
 * <p>Children with the step's tag name are counted; the first one whose count
 * equals the step's position (or any, when the step has no position) and whose
 * attributes match is chosen. The search then continues with the next step from
 * that child, without trying later siblings if that continuation fails.
 *
 * @param nestTags     steps of the expression being evaluated
 * @param parent       node whose children are inspected
 * @param endTagIndex  index of the last wildcard end tag already consumed
 * @param nestTagindex index of this step in {@code nestTags}
 * @param nestTag      the step itself
 * @return the located node, or {@code null}
 */
private Node resuSingle(String[] nestTags, Node parent, int endTagIndex, int nestTagindex, String nestTag) {
    final String wantedTag = this.getTagName(nestTag);
    final Integer wantedPos = this.getPos(nestTag);
    final Map<String, String> wantedAttrs = this.getAttrs(nestTag);
    final List<Node> kids = parent.childNodes();
    if (kids == null) {
        return null;
    }
    int seen = 0;
    for (Node kid : kids) {
        if (!kid.nodeName().toUpperCase().equals(wantedTag)) {
            continue;
        }
        seen++;
        final boolean positionOk = wantedPos == null || wantedPos.intValue() == seen;
        if (positionOk && this.match(kid, wantedAttrs)) {
            final boolean lastStep = nestTagindex == nestTags.length - 1;
            return lastStep ? kid : resuChild(nestTags, kid, nestTagindex + 1, endTagIndex);
        }
    }
    return null;
}
/**
 * Resolves the step at {@code nestTagindex}, dispatching to the wildcard search
 * for {@code *} and to the single-level search for anything else.
 *
 * @param nestTags     steps of the expression being evaluated
 * @param parent       node to continue from; {@code null} ends the search
 * @param nestTagindex index of the step to resolve
 * @param endTagIndex  index of the last wildcard end tag already consumed
 * @return the located node, or {@code null}
 */
private Node resuChild(String[] nestTags, Node parent, int nestTagindex, Integer endTagIndex) {
    if (parent == null) {
        return null;
    }
    final String step = nestTags[nestTagindex];
    if (step.equals("*")) {
        return this.resuMultiple(nestTags, parent, endTagIndex, nestTagindex);
    }
    return this.resuSingle(nestTags, parent, endTagIndex, nestTagindex, step);
}
/**
* Empty entry point; it performs no work.
*
* @param args unused
* @throws FileNotFoundException declared but never thrown by the empty body
*/
public static void main(String[] args) throws FileNotFoundException {
}
}
package com.zzsn.extractor.web;
/**
 * Contract for content extraction processors.
 *
 * Author: Li Dongliang
 * Created: 2015-05-17 15:06:21
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 * @version 1.0
 */
public interface Processor {

    /**
     * Extracts content from a page.
     *
     * @param html page HTML
     * @return the extracted content
     */
    String extract(String html);

    /**
     * Initialises the processor.
     *
     * @return the result object of the initialisation
     */
    DefaultMsg init();

    /**
     * Returns the extraction rule of this processor.
     *
     * @return the rule
     */
    ExtType getExtType();
}
package com.zzsn.extractor.web;
import com.zzsn.util.Constants;
import com.zzsn.util.FileUtil;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
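/**
* Utility class that reads the XML processor template.
* Author: Li Dongliang
* Created: 2015-05-27 14:08:13
* Company: Zhengzhou Shuneng Software Technology Co., Ltd.
* @version 1.0
*
*/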
public class ProcessorReader {
// SLF4J logger for this class.
private static final Logger Log = LoggerFactory.getLogger(ProcessorReader.class);
// Classpath root followed by "conf/wechat-processor.templete". This is an instance field and the
// methods of this class are static; readWeChatProcessors() reads Constants.path instead.
private String path=this.getClass().getResource("/").getPath()+"conf/wechat-processor.templete";
/**
 * Builds an initialised processor named {@code SCOPE} for a scope expression.
 *
 * @param scopeTags scope expression
 * @return the processor, or {@code null} when the expression is {@code null} or empty
 */
public static Processor getScopeProcessor(String scopeTags) {
    final boolean blank = scopeTags == null || scopeTags.length() == 0;
    if (blank) {
        return null;
    }
    final ExtType scope = new ExtType();
    scope.setEname("SCOPE");
    scope.setExp(scopeTags);
    final Processor built = new JsoupTagProcessor(scope);
    built.init();
    return built;
}
/**
 * Builds one processor per extraction type configured in an XML template.
 *
 * <p>For every {@code WebExtractorImpl.EXT_TYPE} value, the root's child named
 * after the lower-cased value is read: {@code <exp>} is required,
 * {@code <subtraction>} and {@code <attr>} are optional.
 *
 * <p>Fixes over the previous version: a {@code null} or unparsable template and
 * an entry without {@code <exp>} used to end in a NullPointerException. They are
 * now logged, and an empty list is returned (or the entry is skipped).
 *
 * @param templete XML text of the template
 * @return the initialised processors in enum order; empty when the template
 *         cannot be read
 */
public static List<Processor> readProcessors(String templete){
    Log.debug("===读取配置文件开始============");
    List<Processor> processors = new ArrayList<Processor>();
    if (templete == null) {
        Log.error("Processor template is null; no processors were created");
        return processors;
    }
    org.jdom.Document doc;
    try {
        doc = new SAXBuilder().build(new StringReader(templete));
    } catch (JDOMException | IOException e) {
        Log.error("Processor template could not be parsed; no processors were created", e);
        return processors;
    }
    Element root = doc.getRootElement();
    for (WebExtractorImpl.EXT_TYPE extTypeStr : WebExtractorImpl.EXT_TYPE.values()) {
        String ename = extTypeStr.toString().toLowerCase();
        Element extTypeEle = root.getChild(ename);
        if (extTypeEle == null) {
            continue;
        }
        Element expEle = extTypeEle.getChild("exp");
        if (expEle == null) {
            Log.warn("Template entry <" + ename + "> has no <exp> child and is skipped");
            continue;
        }
        ExtType extType = new ExtType();
        extType.setEname(ename);
        extType.setExp(expEle.getText().trim());
        Element subtractionEle = extTypeEle.getChild("subtraction");
        if (subtractionEle != null) {
            extType.setSubtraction(subtractionEle.getText().trim());
        }
        Element attrEle = extTypeEle.getChild("attr");
        if (attrEle != null) {
            String attr = attrEle.getText();
            // Only a non-blank attribute name switches the processor to attribute mode.
            if (attr != null && attr.trim().length() > 0) {
                extType.setAttr(attr.trim());
            }
        }
        Processor processor = new JsoupTagProcessor(extType);
        processor.init();
        Log.debug("=======");
        Log.debug(extType.getEname()+":");
        Log.debug("exp["+extType.getExp()+"]");
        Log.debug("subtraction["+extType.getSubtraction()+"]");
        Log.debug("attr["+extType.getAttr()+"]");
        Log.debug("=======");
        processors.add(processor);
    }
    Log.debug("===读取配置文件结束,共有"+processors.size()+"项需要爬取============");
    return processors;
}
/**
 * Reads the template file at {@code Constants.path} as UTF-8 and builds the
 * processors it configures.
 *
 * @return the processors produced by {@code readProcessors} for that file
 */
public static List<Processor> readWeChatProcessors() {
    return readProcessors(FileUtil.readFile(new File(Constants.path), "UTF-8"));
}
/**
* Empty entry point; it performs no work.
*
* @param args unused
*/
public static void main(String[] args) {
}
}
package com.zzsn.extractor.web;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.Extractor;
import com.zzsn.util.CharsetUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.FileUtil;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
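/**
* Content extraction class. Extracts title, summary, body text, author and character set.
* Author: Li Dongliang
* Created: 2015-05-11 15:28:12
* Company: Zhengzhou Shuneng Software Technology Co., Ltd.
* @version 1.0
*
*/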
public class WebExtractorImpl implements Extractor {
private static final Logger Log = LoggerFactory.getLogger(WebExtractorImpl.class);
/** Kinds of content that can be extracted from a page. */
public enum EXT_TYPE {
    /** Body text. */
    CONTENT,
    /** Title. */
    TITLE,
    /** Keywords. */
    KEYWORDS,
    /** Summary. */
    SUMMARY,
    /** Author. */
    AUTHOR,
    /** Publication date. */
    PUBLISH_DATE,
    /** Origin (source). */
    ORIGIN
}
// Processors applied by process(), in list order.
private List<Processor> processors;
// HTTP response; readEntity() reads its Content-Type header.
private HttpResponse getMethod;
// Page HTML; assigned by readEntity() and read by process() and getContent().
private String html;
// Character set determined by readEntity().
private String charset;
/**
* Stores the processors and the HTTP response; nothing is read until readEntity() is called.
*
* @param processors processors to apply
* @param getMethod response whose headers are consulted by readEntity()
*/
public WebExtractorImpl(List<Processor> processors,HttpResponse getMethod){
this.processors = processors;
this.getMethod = getMethod;
}
/**
 * Reads the page from the stream, determines its character set, converts the
 * text accordingly and stores the Jsoup-normalised HTML in this extractor.
 *
 * <p>Failures used to be swallowed without a trace; they are now logged with
 * the URL. The return values are unchanged.
 *
 * @param url         address of the page, used only in the log message
 * @param inputStream body of the HTTP response
 * @return {@code true} when the page was read; {@code false} when any step threw
 */
public boolean readEntity(String url,InputStream inputStream) {
    try {
        html = FileUtil.readHtml(inputStream, Constants.READ_CHARSET);
        Header header = getMethod.getFirstHeader("Content-Type");
        charset = CharsetUtil.getCharset(html, header);
        html = CharsetUtil.convertCorrectCharset(html, charset);
        Document jsoupDoc = Jsoup.parse(html);
        html = jsoupDoc.html();
    } catch (Exception e) {
        Log.warn("Failed to read page content from " + url, e);
        return false;
    }
    return html != null;
}
/**
 * Produces the tagged body text by passing the HTML through
 * {@code ContentUtility.RemoveUselessHTMLTagX}.
 *
 * @param body extracted HTML fragment
 * @return the result of {@code RemoveUselessHTMLTagX}
 */
private String getContentWithTag(String body) {
    return ContentUtility.RemoveUselessHTMLTagX(body);
}
/**
 * Produces tag-free text by passing the HTML through
 * {@code ContentUtility.TransferHTML2Text}.
 *
 * @param contentWithTag HTML fragment
 * @return the result of {@code TransferHTML2Text}
 */
private String getContentNoTag(String contentWithTag) {
    final String plain = ContentUtility.TransferHTML2Text(contentWithTag);
    return plain;
}
/**
 * Removes stray characters from simple HTML tags, e.g. {@code <br />} becomes
 * {@code <br/>}.
 *
 * <p>Each tag that contains one run of characters other than letters, digits,
 * {@code _}, {@code /}, {@code |}, {@code (} and {@code )} is replaced,
 * everywhere in the text, by a copy without such characters.
 *
 * <p>Fix: the matched tag used to be passed to {@code replaceAll} as a regular
 * expression, so a tag such as {@code <p[>} threw a PatternSyntaxException and
 * tags containing {@code .}, {@code *} or {@code +} replaced the wrong text.
 * The tag is now replaced literally. A {@code null} input returns {@code null}.
 *
 * @param html HTML text, may be {@code null}
 * @return the cleaned text
 */
public String formatHtmlTag(String html) {
    if (html == null) {
        return null;
    }
    Pattern p = Pattern.compile("<[\\d|\\w|\\/]*([^(\\d|\\w|\\/)]+)[\\d|\\w|\\/]*>");
    // The matcher keeps scanning the original text while html is being rewritten.
    Matcher m = p.matcher(html);
    while (m.find()) {
        String tag = m.group();
        String cleaned = tag.replaceAll("[^(\\<|\\>|\\d|\\w|\\/)]+", "");
        // Literal replacement: the tag text is data, not a pattern.
        html = html.replace(tag, cleaned);
    }
    return html;
}
/**
 * Runs every processor over the page HTML and stores each result in the entity.
 *
 * <p>A field that is already non-null on the entity is left untouched and its
 * processor is not run. Title, author, publish date, origin, keywords and
 * summary are stored as tag-free text and only when that text is non-empty; the
 * body is stored with tags whenever extraction returns a non-null value.
 *
 * @param entity entity to fill
 * @throws Exception declared by the {@code Extractor} contract
 */
public void process(ExtEntity entity) throws Exception {
    for (Processor processor : processors) {
        final String ename = processor.getExtType().getEname().toUpperCase();
        if (EXT_TYPE.TITLE.toString().equals(ename)) {
            if (entity.getTitle() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setTitle(text);
                }
            }
        } else if (EXT_TYPE.AUTHOR.toString().equals(ename)) {
            if (entity.getAuthor() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setAuthor(text);
                }
            }
        } else if (EXT_TYPE.PUBLISH_DATE.toString().equals(ename)) {
            if (entity.getPublishDate() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setPublishDate(text);
                }
            }
        } else if (EXT_TYPE.ORIGIN.toString().equals(ename)) {
            if (entity.getOrigin() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setOrigin(text);
                }
            }
        } else if (EXT_TYPE.KEYWORDS.toString().equals(ename)) {
            if (entity.getKeywords() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setKeywords(text);
                }
            }
        } else if (EXT_TYPE.SUMMARY.toString().equals(ename)) {
            if (entity.getSummary() == null) {
                final String text = extractPlainText(processor);
                if (text != null) {
                    entity.setSummary(text);
                }
            }
        } else if (EXT_TYPE.CONTENT.toString().equals(ename)) {
            if (entity.getContentWithTag() == null) {
                final String raw = processor.extract(html);
                if (raw != null) {
                    // The body keeps its tags.
                    entity.setContentWithTag(this.getContentWithTag(raw));
                }
            }
        }
    }
}

/**
 * Runs one processor over the page HTML and strips the tags from its result.
 *
 * @param processor processor to run
 * @return the tag-free text, or {@code null} when the processor returned
 *         {@code null} or the text is empty
 */
private String extractPlainText(Processor processor) {
    final String raw = processor.extract(html);
    if (raw == null) {
        return null;
    }
    final String text = this.getContentNoTag(raw);
    return text.length() > 0 ? text : null;
}
/**
 * @return the page HTML stored by {@code readEntity}, or {@code null} when
 *         nothing has been read yet
 */
public String getContent() {
    return this.html;
}
/**
 * @return the character set determined by {@code readEntity}, or {@code null}
 *         when nothing has been read yet
 */
public String getCharset() {
    return this.charset;
}
}
<?xml version="1.0" encoding="UTF-8"?>
<template>
  <!-- Extraction rules: each child names a field and its <exp> holds the tag path used to locate it. -->
  <content>
    <exp>*.div[id="js_content"]</exp>
  </content>
  <title>
    <exp>*.h2[class="rich_media_title"]</exp>
  </title>
  <author>
    <exp>*.a[id="js_name"]</exp>
  </author>
  <publish_date>
    <exp>*.em[id="post-date"]</exp>
  </publish_date>
</template>
package com.zzsn.job;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
/**
* 阻塞线程池
* 线程池的线程数到达最大线程数阻塞等待
* 可用于多线程获取MQ消息任务
* 因为会阻塞,就不用考虑拒绝策略这一块的重写
*/
public class BlockThreadPoolExecute extends ThreadPoolExecutor {
private ReentrantLock lock = new ReentrantLock();
private Condition condition = this.lock.newCondition();
public BlockThreadPoolExecute(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit, BlockingQueue<Runnable> workQueue) {
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
}
@Override
public void execute(Runnable command) {
//进行同步锁定
this.lock.lock();
super.execute(command);
try {
//如果线程池的数量已经达到最大线程池的数量,则进行挂起操作
if (getPoolSize() == getMaximumPoolSize()) {
this.condition.await();
}
} catch (InterruptedException e) {
e.printStackTrace();
} finally {
this.lock.unlock();
}
}
@Override
protected void afterExecute(Runnable r, Throwable t) {
try{
lock.lock();
this.condition.signal();
}finally {
this.lock.unlock();
}
}
}
\ No newline at end of file
package com.zzsn.job;
import com.zzsn.util.Constants;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.List;
import java.util.Set;
/**
 * Static helpers around a single shared {@link JedisPool}.
 *
 * <p>Most helpers prepend {@link #PREFIX} to the key; {@link #getKeysByPattern},
 * {@link #sadd} and {@link #sismember} use the key/pattern exactly as given.
 * Every helper borrows one connection and returns it to the pool before returning.
 */
public class JedisUtil {
    private static final String PREFIX = "weixin_";
    private static final Logger logger = LoggerFactory.getLogger(JedisUtil.class);

    /** Lazily created on first use; volatile so the double-checked read in getJedis() is safe. */
    private static volatile JedisPool jedisPool = null;

    private JedisUtil() {
    }

    /**
     * Builds the connection pool from the Redis settings in {@link Constants}.
     * NOTE(review): the original read Constants.REDIS_PASS but never passed it to the
     * pool; that is unchanged here — confirm whether the server requires AUTH.
     */
    private static void init() {
        String host = Constants.REDIS_LOCALHOST;
        String port = Constants.REDIS_PORT;
        String timeout = Constants.REDIS_TIMEOUT;
        String maxIdle = Constants.REDIS_MAXIDLE;
        String maxTotal = Constants.REDIS_MAXTOTAL;
        String maxWaitMillis = Constants.REDIS_MAXWAITMILLIS;
        String testOnBorrow = Constants.REDIS_TESTONBORROW;
        JedisPoolConfig config = new JedisPoolConfig();
        // Upper bound on connections handed out by the pool.
        config.setMaxTotal(Integer.parseInt(maxTotal));
        // Upper bound on idle connections kept in the pool.
        config.setMaxIdle(Integer.parseInt(maxIdle));
        // Maximum time to wait when borrowing a connection.
        config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
        // Whether a connection is validated before it is handed out.
        config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
        jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
    }

    /**
     * Borrows a connection, creating the pool on first use only.
     * (Previously a brand-new pool was built on every call.)
     */
    private static Jedis getJedis() {
        if (jedisPool == null) {
            synchronized (JedisUtil.class) {
                if (jedisPool == null) {
                    init();
                }
            }
        }
        return jedisPool.getResource();
    }

    private static final JedisUtil jedisUtil = new JedisUtil();

    /**
     * @return the singleton JedisUtil instance
     */
    public static JedisUtil getInstance() {
        return jedisUtil;
    }

    /**
     * Returns a borrowed connection to the pool.
     *
     * @param jedis connection to release; ignored when null or when no pool exists
     */
    public static void returnResource(final Jedis jedis) {
        if (jedis != null && jedisPool != null) {
            jedis.close();
        }
    }

    /**
     * Borrows a connection from the shared pool.
     *
     * @return a connection the caller must release with {@code close()} or {@link #returnResource}
     */
    public static Jedis getDefaultJedis() {
        return getJedis();
    }

    /** Logs and throws when {@code key} is null or empty. */
    private static void requireKey(String key) throws Exception {
        if (StringUtils.isEmpty(key)) {
            logger.error("key is null");
            throw new Exception("key is null");
        }
    }

    /** Logs and throws when {@code key} or {@code flag} is null or empty. */
    private static void requireKeyAndFlag(String key, String flag) throws Exception {
        if (StringUtils.isEmpty(key) || StringUtils.isEmpty(flag)) {
            logger.error("key or flag is null");
            throw new Exception("key or flag is null");
        }
    }

    /**
     * Resolves the list end selected by {@code flag}.
     *
     * @return true for "L" (head), false for "R" (tail), case-insensitive
     * @throws Exception if the flag is neither "L" nor "R"
     */
    private static boolean isLeft(String flag) throws Exception {
        if (flag.equalsIgnoreCase("L")) {
            return true;
        }
        if (flag.equalsIgnoreCase("R")) {
            return false;
        }
        logger.error("unknown flag");
        throw new Exception("unknown flag");
    }

    /**
     * Returns the keys matching {@code pattern}. The pattern is used as given (no prefix).
     */
    public static Set<String> getKeysByPattern(String pattern) {
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.keys(pattern);
        }
    }

    /**
     * @return whether the prefixed key exists
     * @throws Exception if {@code key} is null or empty
     */
    public static boolean exists(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.exists(PREFIX + key);
        }
    }

    /**
     * Deletes the prefixed key.
     *
     * @throws Exception if {@code key} is null or empty
     */
    public static void del(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            jedis.del(PREFIX + key);
        }
    }

    /**
     * Stores {@code value} under the prefixed key and sets its time to live.
     *
     * @param expireTime TTL in seconds; when not positive, a 15-day TTL is applied
     * @throws Exception if {@code key} is null or empty
     */
    public static void setString(String key, String value, int expireTime) throws Exception {
        requireKey(key);
        String finalKey = PREFIX + key;
        // One connection for both commands instead of one borrowed connection per command.
        try (Jedis jedis = getDefaultJedis()) {
            jedis.set(finalKey, value);
            if (expireTime > 0) {
                jedis.expire(finalKey, expireTime);
            } else {
                jedis.expire(finalKey, 60 * 60 * 24 * 15);
            }
        }
    }

    /**
     * @return the string stored under the prefixed key
     * @throws Exception if {@code key} is null or empty
     */
    public static String getString(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.get(PREFIX + key);
        }
    }

    /**
     * Issues SETNX on the prefixed key.
     *
     * @return the reply of the SETNX command
     * @throws Exception if {@code key} is null or empty
     */
    public static long setnx(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.setnx(PREFIX + key, value);
        }
    }

    /**
     * Issues EXPIRE on the prefixed key.
     *
     * @return the reply of the EXPIRE command
     * @throws Exception if {@code key} is null or empty
     */
    public static long expire(String key, int seconds) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.expire(PREFIX + key, seconds);
        }
    }

    /**
     * Pushes {@code value} onto the list stored under the prefixed key.
     *
     * @param flag "L" pushes at the head (LPUSH), "R" at the tail (RPUSH); case-insensitive
     * @throws Exception if key or flag is null/empty, or the flag is unknown
     */
    public static void pushList(String key, String value, String flag) throws Exception {
        requireKeyAndFlag(key, flag);
        boolean left = isLeft(flag);
        try (Jedis jedis = getDefaultJedis()) {
            if (left) {
                jedis.lpush(PREFIX + key, value);
            } else {
                jedis.rpush(PREFIX + key, value);
            }
        }
    }

    /**
     * Pops one element from the list stored under the prefixed key.
     *
     * @param flag "L" pops from the head (LPOP), "R" from the tail (RPOP); case-insensitive
     * @return the reply of the pop command
     * @throws Exception if key or flag is null/empty, or the flag is unknown
     */
    public static String popList(String key, String flag) throws Exception {
        requireKeyAndFlag(key, flag);
        boolean left = isLeft(flag);
        try (Jedis jedis = getDefaultJedis()) {
            return left ? jedis.lpop(PREFIX + key) : jedis.rpop(PREFIX + key);
        }
    }

    /**
     * Returns the elements of the prefixed list in the range {@code start..end} (LRANGE).
     *
     * @throws Exception if {@code key} is null or empty
     */
    public static List<String> getAppointedList(String key, long start, long end) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.lrange(PREFIX + key, start, end);
        }
    }

    /**
     * Returns every element of the prefixed list (LRANGE 0 -1).
     *
     * @throws Exception if {@code key} is null or empty
     */
    public static List<String> getList(String key) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.lrange(PREFIX + key, 0, -1);
        }
    }

    /**
     * Adds {@code value} to the set stored under {@code key}.
     * NOTE(review): unlike the string/list helpers, the key is NOT prefixed here; kept
     * as-is so existing data stays reachable — confirm this is intended.
     *
     * @throws Exception if {@code key} is null or empty
     */
    public static void sadd(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            jedis.sadd(key, value);
        }
    }

    /**
     * Tests membership of {@code value} in the set stored under {@code key} (key not prefixed).
     *
     * @throws Exception if {@code key} is null or empty
     */
    public static boolean sismember(String key, String value) throws Exception {
        requireKey(key);
        try (Jedis jedis = getDefaultJedis()) {
            return jedis.sismember(key, value);
        }
    }
}
package com.zzsn.job;
import com.alibaba.fastjson.JSON;
import com.zzsn.crawler.WeixinDetailThread;
import com.zzsn.crawler.WeixinSiteThread;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.util.Constants;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.scheduling.annotation.Async;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@Component
@EnableScheduling
@Slf4j
public class KafkaConsumerJob {
private static KafkaConsumer<String, String> createConsumer() {
Properties properties = new Properties();
properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
//kafka数据的读取方式
properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
// latest earliest
//时间间隔设置为1h
properties.put("max.poll.interval.ms", 60*60*1000);
properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
return new KafkaConsumer<>(properties);
}
/**
* 从kafka中获取公众号信息进行发送获取列表内容提取链接
*/
@Scheduled(cron = "0 0/3 * * * ?")
@Async("asyncTaskExecutor")
public void wxOfficialConsumer (){
ExecutorService threadPool = Executors.newSingleThreadExecutor();
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
try{
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class);
WeixinSiteThread siteThread=new WeixinSiteThread();
siteThread.siteMsgTemple=siteMsgTemple;
//创建使用固定线程数的线程池
threadPool.execute(siteThread);
TimeUnit.SECONDS.sleep(20);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_CONSUMER_TOPIC));
}
threadPool.shutdown();
while(true)
{
boolean isfinished = threadPool.isTerminated();
if(isfinished)
break;
}
}
ExecutorService serviceDetail = Executors.newFixedThreadPool(1);
/**
* 从kafka中获取微信资讯url进行解析详情
*/
// @Scheduled(cron = "0 0/2 * * * ?")
public void wxDetailconsumer (){
log.info("定时获取mq消息");
//1.创建消费者
KafkaConsumer<String, String> consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_WXDETAILURL_TOPIC));
try{
while(true){
//消费者是一个长期运行的程序,通过持续轮询向Kafka请求数据。在其他线程中调用consumer.wakeup()可以退出循环
//在0ms内等待Kafka的broker返回数据.超时参数指定poll在多久之后可以返回,不管有没有可用的数据都要返回
ConsumerRecords<String, String> records = consumer.poll(0);
consumer.commitSync();
for(ConsumerRecord record : records){
SiteMsgTemple siteMsgTemple = JSON.parseObject(record.value().toString(), SiteMsgTemple.class);
WeixinDetailThread siteThread=new WeixinDetailThread();
siteThread.siteMsgTemple=siteMsgTemple;
// siteThread.start();
//创建使用固定线程数的线程池
serviceDetail.execute(()->{
String threadName= Thread.currentThread().getName();
System.out.println(threadName+"开始执行");
try {
siteThread.start();
TimeUnit.SECONDS.sleep(2);
} catch (InterruptedException e) {
e.printStackTrace();
}
System.out.println(threadName+"执行结束");
});
TimeUnit.SECONDS.sleep(10);
}
}
}catch (Exception e){
consumer = createConsumer();
consumer.subscribe(Arrays.asList(Constants.KAFKA_WXDETAILURL_TOPIC));
}
}
private static KafkaConsumer<String, String> create2Consumer() {
Properties props = new Properties();
// 必须设置的属性
props.put("bootstrap.servers", "114.115.159.144:9092");
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("group.id", "group1");
// 可选设置属性
props.put("enable.auto.commit", "true");
// 自动提交offset,每1s提交一次
props.put("auto.commit.interval.ms", "1000");
props.put("auto.offset.reset","earliest ");
props.put("client.id", "es-sync");
return new KafkaConsumer<>(props);
}
}
package com.zzsn.job;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.StringSerializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.kafka.annotation.EnableKafka;
import org.springframework.kafka.core.DefaultKafkaProducerFactory;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.kafka.core.ProducerFactory;
import java.util.HashMap;
import java.util.Map;
/**
 * Kafka producer configuration: exposes a {@link KafkaTemplate} bean whose settings
 * come from the {@code kafka.producer.*} properties.
 *
 * @author Java小白
 */
@Configuration
@EnableKafka
public class KafkaProducerConfig {

    /** Injected from {@code kafka.producer.servers}. */
    @Value("${kafka.producer.servers}")
    private String servers;

    /** Injected from {@code kafka.producer.retries}. */
    @Value("${kafka.producer.retries}")
    private int retries;

    /** Injected from {@code kafka.producer.batch.size}. */
    @Value("${kafka.producer.batch.size}")
    private int batchSize;

    /** Injected from {@code kafka.producer.linger}. */
    @Value("${kafka.producer.linger}")
    private int linger;

    /** Injected from {@code kafka.producer.buffer.memory}. */
    @Value("${kafka.producer.buffer.memory}")
    private int bufferMemory;

    /**
     * Assembles the producer settings from the injected fields.
     *
     * @return a new mutable map of producer configuration entries
     */
    public Map<String, Object> producerConfigs() {
        final Map<String, Object> settings = new HashMap<>();

        // Serialization: both key and value are plain strings.
        settings.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
        settings.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);

        // Connection and retry behaviour.
        settings.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, servers);
        settings.put(ProducerConfig.RETRIES_CONFIG, retries);

        // Batching and buffering.
        settings.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize);
        settings.put(ProducerConfig.LINGER_MS_CONFIG, linger);
        settings.put(ProducerConfig.BUFFER_MEMORY_CONFIG, bufferMemory);
        settings.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, "10485760");

        return settings;
    }

    /**
     * Creates the producer factory backed by {@link #producerConfigs()}.
     *
     * @return a new producer factory
     */
    public ProducerFactory<String, String> producerFactory() {
        final Map<String, Object> settings = producerConfigs();
        return new DefaultKafkaProducerFactory<>(settings);
    }

    /**
     * Message-sending helper registered as a Spring bean.
     *
     * @return a template built on {@link #producerFactory()}
     */
    @Bean
    public KafkaTemplate<String, String> kafkaTemplate() {
        final ProducerFactory<String, String> factory = producerFactory();
        return new KafkaTemplate<>(factory);
    }
}
\ No newline at end of file
package com.zzsn.job;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
public class PropertyUtil {
//加载property文件到io流里面
public static Properties loadProperties(String propertyFile) {
Properties properties = new Properties();
try {
InputStream is = PropertyUtil.class.getClassLoader().getResourceAsStream(propertyFile);
if(is == null){
is = PropertyUtil.class.getClassLoader().getResourceAsStream("properties/" + propertyFile);
}
properties.load(is);
} catch (IOException e) {
e.printStackTrace();
}
return properties;
}
/**
* 根据key值取得对应的value值
*
* @param key
* @return
*/
public static String getValue(String propertyFile, String key) {
Properties properties = loadProperties(propertyFile);
return properties.getProperty(key);
}
}
package com.zzsn.job;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.util.Properties;
import java.util.Set;
/**
 * Static Redis helpers backed by a {@link JedisPool} configured from
 * {@code conf/redis.properties}. Keys are used exactly as given (no prefix).
 */
public class RediscachedFactory {
    private static final Logger logger = LoggerFactory.getLogger(RediscachedFactory.class);

    /** Shared pool; created by {@link #init()}, lazily on first use if not called explicitly. */
    private static volatile JedisPool jedisPool = null;

    /**
     * (Re)creates the connection pool from {@code conf/redis.properties}.
     * NOTE(review): {@code redis.pass} exists in the properties but was never applied to
     * the pool by the original code; that is unchanged — confirm whether AUTH is needed.
     * Calling this again replaces the pool reference without closing the previous pool.
     */
    public static void init() {
        Properties properties = PropertyUtil.loadProperties("conf/redis.properties");
        String host = properties.getProperty("redis.host");
        String port = properties.getProperty("redis.port");
        String timeout = properties.getProperty("redis.timeout");
        String maxIdle = properties.getProperty("redis.maxIdle");
        String maxTotal = properties.getProperty("redis.maxTotal");
        String maxWaitMillis = properties.getProperty("redis.maxWaitMillis");
        String testOnBorrow = properties.getProperty("redis.testOnBorrow");
        JedisPoolConfig config = new JedisPoolConfig();
        // Upper bound on connections handed out by the pool.
        config.setMaxTotal(Integer.parseInt(maxTotal));
        // Upper bound on idle connections kept in the pool.
        config.setMaxIdle(Integer.parseInt(maxIdle));
        // Maximum time to wait when borrowing a connection.
        config.setMaxWaitMillis(Long.parseLong(maxWaitMillis));
        // Whether a connection is validated before it is handed out.
        config.setTestOnBorrow(Boolean.valueOf(testOnBorrow));
        jedisPool = new JedisPool(config, host, Integer.parseInt(port), Integer.parseInt(timeout));
    }

    /**
     * Borrows a connection, initializing the pool first if {@link #init()} was never
     * called (previously that case failed with a NullPointerException).
     */
    private static Jedis getJedis() {
        if (jedisPool == null) {
            synchronized (RediscachedFactory.class) {
                if (jedisPool == null) {
                    init();
                }
            }
        }
        return jedisPool.getResource();
    }

    /**
     * Reads a string value.
     *
     * @param key key to read
     * @return the stored value; null if the key is empty, absent, or the read failed
     */
    public static String getKeyStr(String key) {
        if (StringUtils.isEmpty(key)) {
            return null;
        }
        // close() on a pooled Jedis returns the connection to the pool.
        try (Jedis jedis = getJedis()) {
            return jedis.get(key);
        } catch (Exception e) {
            logger.error("redis get failed, key=" + key, e);
        }
        return null;
    }

    /**
     * Stores a string value without setting any expiry.
     *
     * @param key key to write
     * @param value value to store
     * @return true when the value was written; false for an empty key or on failure
     *         (the original always returned false, even on success)
     */
    public static boolean setKeyStr(String key, String value) {
        if (StringUtils.isEmpty(key)) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.set(key, value);
            return true;
        } catch (Exception e) {
            logger.error("redis set failed, key=" + key, e);
        }
        return false;
    }

    /**
     * Stores a string value and, when {@code time} is positive, sets its time to live.
     *
     * @param key key to write
     * @param value value to store
     * @param time TTL in seconds; null or non-positive leaves the key without an expiry
     * @return true when the value was written; false for an empty key or on failure
     *         (the original always returned false, even on success)
     */
    public static boolean setKeyStrtime(String key, String value, Integer time) {
        if (StringUtils.isEmpty(key)) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.set(key, value);
            if (time != null && time > 0) {
                jedis.expire(key, time);
            }
            return true;
        } catch (Exception e) {
            logger.error("redis set with ttl failed, key=" + key, e);
        }
        return false;
    }

    /**
     * Reads all members of a set (SMEMBERS).
     *
     * @param key key of the set
     * @return the reply of SMEMBERS, or null if the read failed
     */
    public static Set<String> getKeySet(String key) {
        try (Jedis jedis = getJedis()) {
            // The original also compared the Set against the string "null", which can
            // never be true; that dead check is gone.
            return jedis.smembers(key);
        } catch (Exception e) {
            logger.error("redis smembers failed, key=" + key, e);
        }
        return null;
    }

    /**
     * Adds all given members to a set (SADD) without setting any expiry.
     *
     * @param key key of the set
     * @param value members to add
     * @return true when the members were sent; false for a null/empty collection or on
     *         failure (the original always returned false, even on success)
     */
    public static boolean setKeySet(String key, Set<String> value) {
        if (value == null || value.isEmpty()) {
            return false;
        }
        try (Jedis jedis = getJedis()) {
            jedis.sadd(key, value.toArray(new String[value.size()]));
            return true;
        } catch (Exception e) {
            logger.error("redis sadd failed, key=" + key, e);
        }
        return false;
    }

    /**
     * Deletes a key. An empty key is logged and ignored; failures are logged.
     *
     * @param key key to delete
     */
    public static void del(String key) {
        if (StringUtils.isEmpty(key)) {
            logger.error("key is null");
            return;
        }
        try (Jedis jedis = getJedis()) {
            jedis.del(key);
        } catch (Exception e) {
            logger.error("redis del failed, key=" + key, e);
        }
    }
}
package com.zzsn.test;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import java.util.Map;
/**
 * Manual check of article extraction: fetches one WeChat article, prints its tagged
 * content, then prints the content again with {@code data-src} rewritten to {@code src}
 * in every file tag found by {@link ContentFileFinder}.
 *
 * Author: 李东亮, created 2016-4-6, 郑州数能软件科技有限公司.
 * @version 1.0
 */
public class MqReceiver {
    private static final Logger Log = LoggerFactory.getLogger(MqReceiver.class);

    public static void main(String[] args) {
        WeiXinDispatch wx = new WeiXinDispatch();
        ExtEntity extEntity = wx.getExtractorElement("https://mp.weixin.qq.com/s/-LE-dmgbTAWbzStiPUNynA");
        Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(extEntity.getContentWithTag());
        System.out.println(extEntity.getContentWithTag());
        String formatImgContent = extEntity.getContentWithTag();
        for (String key : imgDataMap.keySet()) {
            String key2 = key.replace("data-src", "src");
            // A key without "data-src" maps to itself. The original looped on
            // `while (content.contains(key))`, which never terminates in that case
            // because the replacement changes nothing.
            if (key2.equals(key) || !formatImgContent.contains(key)) {
                continue;
            }
            System.out.println(key);
            // String.replace already rewrites every occurrence, so one pass suffices.
            formatImgContent = formatImgContent.replace(key, key2);
        }
        System.out.println(formatImgContent);
    }
}
package com.zzsn.test;
import com.alibaba.fastjson.JSON;
import com.zzsn.crawler.WeixinSiteThread;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.job.KafkaConsumerJob;
import com.zzsn.util.Constants;
import org.apache.kafka.clients.CommonClientConfigs;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.util.Arrays;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Manual test consumer: prints every message read from
 * {@code Constants.KAFKA_PRODUCT_TOPIC} as a {@link ClbAnsProcessitem}.
 */
public class MsgReceiver {

    public static void main(String[] args) {
        wxOfficialConsumer();
    }

    /**
     * Polls the topic until polling or parsing fails, printing each parsed record.
     * The consumer is always closed on exit.
     */
    public static void wxOfficialConsumer() {
        KafkaConsumer<String, String> consumer = createConsumer();
        try {
            consumer.subscribe(Arrays.asList(Constants.KAFKA_PRODUCT_TOPIC));
            while (true) {
                // Wait up to one second for data rather than spinning on poll(0).
                ConsumerRecords<String, String> records = consumer.poll(1000);
                consumer.commitSync();
                for (ConsumerRecord<String, String> record : records) {
                    if (record.value() == null) {
                        // Nothing to parse; skip instead of failing the whole loop.
                        continue;
                    }
                    ClbAnsProcessitem siteMsgTemple = JSON.parseObject(record.value(), ClbAnsProcessitem.class);
                    System.out.println(siteMsgTemple);
                }
            }
        } catch (Exception e) {
            // The original silently created a second consumer here that was never used
            // or closed; report the failure instead.
            e.printStackTrace();
        } finally {
            try {
                consumer.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }

    /**
     * Creates a String/String consumer configured from {@link Constants}:
     * manual offset commit, one record per poll, one-hour max poll interval.
     */
    private static KafkaConsumer<String, String> createConsumer() {
        Properties properties = new Properties();
        properties.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, Constants.KAFKA_CONSUMER_SERVERS);
        properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        properties.put(ConsumerConfig.GROUP_ID_CONFIG, Constants.KAFKA_CONSUMER_GROUP_ID);
        properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
        // Where to start reading when no committed offset exists (latest / earliest).
        properties.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, Constants.KAFKA_CONSUMER_AUTO_OFFSET_RESET);
        // Allow up to one hour between polls.
        properties.put("max.poll.interval.ms", 60 * 60 * 1000);
        properties.put(ConsumerConfig.MAX_POLL_RECORDS_CONFIG, 1);
        return new KafkaConsumer<>(properties);
    }
}
package com.zzsn.test;
import com.zzsn.util.Constants;
import com.zzsn.util.WeixinUtil;
/**
 * Manual test that pushes one profile link through {@link WeixinUtil#sendWxMessage}.
 */
public class Test {

    public static void main(String[] args) throws Exception {
        for (int i = 0; i < 1; i++) {
            sendurl("https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzU0MTk2NzA0Nw==&scene=177#wechat_redirect");
            Thread.sleep(30 * 1000);
        }
    }

    /**
     * Sends {@code url} as a message to the recipient named by {@code Constants.WXSENDNAME}.
     * Failures are printed and followed by a one-second pause.
     *
     * @param url link to include in the message text
     */
    public static void sendurl(String url) {
        try {
            // NOTE(review): the two string arguments below look like an account id and a
            // secret committed to source control — consider moving them to configuration
            // and rotating the secret.
            WeixinUtil.sendWxMessage(Constants.WXSENDNAME, "点击链接:" + url, 1000002, "ww6bef1e81aacbf27a",
                    "ttZJ_KbO3QABs5Z7IDHNa_X4CZizaojherzwzfQ7wl0");
        } catch (Exception e) {
            try {
                Thread.sleep(1 * 1000);
            } catch (InterruptedException e1) {
                // Keep the interrupt visible to the caller instead of dropping it.
                Thread.currentThread().interrupt();
            }
            e.printStackTrace();
        }
    }
}
package com.zzsn.util;
import org.apache.http.Header;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers for detecting and converting the character set of an HTML document.
 *
 * Author: 李东亮, created 2016-5-27, 郑州数能软件科技有限公司.
 * @version 1.0
 */
public class CharsetUtil {

    /** Number of leading characters of a document scanned for a charset declaration. */
    private static final int DECLARATION_SCAN_LIMIT = 3000;

    private static final Pattern CHARSET_ATTR = Pattern.compile("charset=([^'\";\\s>]+)");
    private static final Pattern META_CONTENT_TYPE = Pattern.compile("(?is)<meta\\s+[^>]*http-equiv\\s*=\\s*['\"]*content-type['\"]*[^>]*>");
    private static final Pattern META_CHARSET = Pattern.compile("(?si)<meta\\s+[^>]*charset=['\"]([^'\";\\s>]+)['\"]");
    private static final Pattern XML_ENCODING = Pattern.compile("(?is)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]");

    /**
     * Determines the charset name of a response.
     *
     * <p>Order of precedence: the {@code charset=} parameter of the HTTP Content-Type
     * header, then a declaration inside the document, then {@code Constants.READ_CHARSET}.
     *
     * @param html document text; may be null
     * @param contentTypeheader HTTP Content-Type header; may be null
     * @return the charset name; never null
     */
    public static String getCharset(String html, Header contentTypeheader) {
        Charset charset = null;
        // 1. HTTP header.
        if (contentTypeheader != null) {
            charset = CharsetUtil.matchCharset(contentTypeheader.getValue());
        }
        // 2. Declaration inside the document.
        if (charset == null) {
            charset = CharsetUtil.getContentDeclaredCharset(html);
        }
        // 3. Default.
        if (charset == null) {
            return Constants.READ_CHARSET;
        }
        return charset.toString();
    }

    /**
     * Resolves a charset name, returning null for a null, illegal or unsupported name.
     */
    private static Charset lookup(String charsetName) {
        if (charsetName == null) {
            return null;
        }
        try {
            return Charset.forName(charsetName);
        } catch (IllegalArgumentException iae) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return null;
        }
    }

    /**
     * Extracts the first {@code charset=...} token from {@code content}.
     *
     * @param content text to scan; may be null
     * @return the matching charset, or null if none is declared or the name is unknown
     */
    public static Charset matchCharset(String content) {
        if (content == null) {
            return null;
        }
        Matcher matcher = CHARSET_ATTR.matcher(content);
        if (!matcher.find()) {
            return null;
        }
        String charsetName = matcher.group(1);
        // A bare "2312" is treated as gb2312.
        if (charsetName.equals("2312")) {
            charsetName = "gb2312";
        }
        return lookup(charsetName);
    }

    /**
     * Finds the charset declared in the first 3000 characters of a document.
     *
     * <p>Checked in order: {@code <meta http-equiv="content-type" ...>},
     * {@code <meta charset="...">}, {@code <?xml ... encoding="..."?>}.
     *
     * @param content document text; may be null
     * @return the declared charset, or null if none is declared or the name is unknown
     */
    public static Charset getContentDeclaredCharset(String content) {
        if (content == null) {
            return null;
        }
        // The original tested the length of an empty string here, so the prefix was
        // never applied and the whole document was always scanned.
        String contentPrefix = content.length() > DECLARATION_SCAN_LIMIT
                ? content.substring(0, DECLARATION_SCAN_LIMIT)
                : content;

        Matcher matcher = META_CONTENT_TYPE.matcher(contentPrefix);
        if (matcher.find()) {
            Charset charset = matchCharset(matcher.group());
            if (charset != null) {
                return charset;
            }
        }

        matcher = META_CHARSET.matcher(contentPrefix);
        if (matcher.find()) {
            return lookup(matcher.group(1));
        }

        matcher = XML_ENCODING.matcher(contentPrefix);
        if (matcher.find()) {
            // Previously an unknown encoding name here escaped as an unchecked exception.
            return lookup(matcher.group(1));
        }
        return null;
    }

    /**
     * Re-decodes text that was read with {@code Constants.READ_CHARSET} using
     * {@code targetCharset}, then applies {@link #preProcessHtml(String)}.
     *
     * @param html text as originally decoded; may be null
     * @param targetCharset charset the bytes should be decoded with; null leaves the text untouched
     * @return the re-decoded, pre-processed text; {@code html} itself when either argument is null
     */
    public static String convertCorrectCharset(String html, String targetCharset) {
        if (html == null || targetCharset == null) {
            return html;
        }
        if (!Constants.READ_CHARSET.equals(targetCharset)) {
            try {
                html = new String(html.getBytes(Constants.READ_CHARSET), targetCharset);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
        return preProcessHtml(html);
    }

    /**
     * Replaces a bare {@code <!DOCTYPE html>} declaration with an {@code <html>} tag.
     *
     * @param html document text; not null
     * @return the text with the replacement applied
     */
    private static String preProcessHtml(String html) {
        html = html.replaceAll("<\\!DOCTYPE\\s+html>", "<html>");
        return html;
    }

    /**
     * Converts text to {@code Constants.DEFAULT_CHARSET} unless the source charset is
     * already UTF-8 ("UTF8" or "utf-8", case-insensitive).
     *
     * @param html text to convert; may be null
     * @param sourceCharset charset name of the text; may be null
     * @return the converted text; {@code html} itself when either argument is null or conversion fails
     */
    public static String converCharsetToUTF8(String html, String sourceCharset) {
        if (html == null || sourceCharset == null) {
            return html;
        }
        if (!sourceCharset.matches("(?i)UTF8|(?i)utf-8")) {
            try {
                html = CharsetUtil.convertCharset(html, sourceCharset, Constants.DEFAULT_CHARSET);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
        return html;
    }

    /**
     * Round-trips {@code content} through {@code sourceCharset} and then
     * {@code targetCharset}.
     * NOTE(review): each step encodes and decodes with the same charset, so this looks
     * like an identity except for characters the charsets cannot represent — confirm
     * the intended conversion.
     *
     * @param content text to convert
     * @param sourceCharset name of the first charset
     * @param targetCharset name of the second charset
     * @return the round-tripped text
     * @throws UnsupportedEncodingException if either charset name is not supported
     */
    public static String convertCharset(String content, String sourceCharset, String targetCharset) throws UnsupportedEncodingException {
        byte[] newtemp = new String(content.getBytes(sourceCharset), sourceCharset).getBytes(targetCharset);
        String result = new String(newtemp, targetCharset);
        return result;
    }
}
package com.zzsn.util;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
* 系统变量
*
*/
@Slf4j
public class Constants {
public static final String READ_CHARSET = "ISO8859-1";
public static String DEFAULT_CHARSET = "UTF-8";
private static Properties prop =getConfig();
/**
 * Loads {@code constants.properties} through Spring's {@link DefaultResourceLoader}.
 * A read failure is logged and whatever was loaded so far (possibly nothing) is returned.
 *
 * @return the loaded properties; never null
 */
public static Properties getConfig() {
    Properties properties = new Properties();
    String location = "constants.properties";
    Resource resource = new DefaultResourceLoader().getResource(location);
    // try-with-resources replaces the manual finally/close of the original.
    try (InputStream is = resource.getInputStream()) {
        properties.load(is);
        log.debug("constants config: {}", properties.toString());
    } catch (IOException ex) {
        log.error("Could not load property file:" + location, ex);
    }
    return properties;
}
public static final Integer DEV_MODEL =Integer.valueOf(prop.getProperty("DEV_MODEL"));
//微信爬虫相关的常量
public static final String WXSENDNAME =prop.getProperty("WXSENDNAME");
public static final String KAFKA_WXCONSUMER_TOPIC =prop.getProperty("KAFKA_WXCONSUMER_TOPIC");
public static final String KAFKA_WXDETAILURL_TOPIC =prop.getProperty("KAFKA_WXDETAILURL_TOPIC");
public static String path=prop.getProperty("path");
//输出目录,仅此处需要修改
//public static final String DEST_DIR = prop.getProperty("DEST_DIR");
//正文目录
public static final String CONTENT_DIR =prop.getProperty("CONTENT_DIR");
//图片目录
public static final String IMG_DIR = prop.getProperty("IMG_DIR");
//保存临界值
public static final Integer SAVE_LIMIT_SIZE = Integer.valueOf(prop.getProperty("SAVE_LIMIT_SIZE"));
//保存LIST时间间隔
public static final Integer SAVE_TIME_INTERVAL_MINUTE = Integer.valueOf(prop.getProperty("SAVE_TIME_INTERVAL_MINUTE"));
//缓存更新间隔时间
public static final Integer CACHE_UPDATE_INTERVAL = Integer.valueOf(prop.getProperty("CACHE_UPDATE_INTERVAL"));
//自动关键词数量
public static final Integer AUTO_KEYWORDS_SIZE = 10;
//Lucence 索引保存路径
public static final String LUCENE_INDEX_DIR = prop.getProperty("LUCENE_INDEX_DIR");
//Lucence 索引保存路径
public static final String LUCENE_TASK_INDEX_DIR = prop.getProperty("LUCENE_TASK_INDEX_DIR");
//爬取内容小于此大小的不保存
public static final Integer MIN_CONTENT_SIZE = 4;
//应用北大算法的分类id
public static final String FILTERTID = prop.getProperty("FILTERTID");
//相关性过滤算法URl(XGBOOST)
public static final String RELEVANCE_XGBOOST_URL = prop.getProperty("RELEVANCE_XGBOOST_URL");
//相关性过滤算法URl(LOGISTIC)
public static final String RELEVANCE_LOGISTIC_URL = prop.getProperty("RELEVANCE_LOGISTIC_URL");
//情感判断算法URl
public static final String SENTIMENT_ANALYSIS_URL = prop.getProperty("SENTIMENT_ANALYSIS_URL");
public static final String CHILDMODEID = prop.getProperty("CHILDMODEID");
//Maximum summary length
public static final Integer SUMMARY_MAX_LENGTH =350;
public static final Integer AUTO_TYPE_WEIGHT =Integer.valueOf(prop.getProperty("AUTO_TYPE_WEIGHT"));
//抽取
public static final String EXTRACTOR = "EXTRACTOR";
//噪音过滤
public static final String NOISEFILTER = "NOISEFILTER";
//黑名单
public static final String EXCLUDEWORD = "EXCLUDEWORD";
//排重
public static final String SIMILARITY = "SIMILARITY";
//分词
public static final String SEGMENT = "SEGMENT";
//自动摘要,自动关键词
public static final String GENERATION = "GENERATION";
//自动分类
public static final String AUTO_TYPE = "AUTO_TYPE";
//过滤分类
public static final String SYSTEMTYPEFILTER = "SYSTEMTYPEFILTER";
//企业识别
public static final String COMPANYFIND = "COMPANYFIND";
//过滤分类
public static final String SUBJECTKEYWORDSFILTER = "SUBJECTKEYWORDSFILTER";
//过滤信息源
public static final String SITEFILTER = "SITEFILTER";
//情感分析
public static final String EMOTION = "EMOTION";
//命名实体提取(特征词提取)
public static final String ENTITY = "ENTITY";
//爬取到的网页类型
public static final String TYPE_PDF = "PDF";
public static final String TYPE_WORD = "WORD";
public static final String TYPE_PPT = "PPT";
public static final String TYPE_EXCEL = "EXCEL";
public static final String TYPE_HTML = "HTML";
//缓存设置key
public static final String SYSTEMTYPE_CACHE_KEY = "systemtype";
public static final String KEYWORDS_CACHE_KEY = "keywords";
public static final String SYSTEMTYPEURL_CACHE_KEY = "systemtypeurl";
public static final String SYSTEMFILTER_CACHE_KEY = "systemfilter";
public static final String EXCLUDWWORD_CACHE_KEY = "excludword";
//打分分类
public static final String SCORE_ORGIN = "orgin";
public static final String SCORE_MEDIACOUNT = "MediaCount";
public static final String SCORE_INFOLENGTH = "InfoLength";
//Hash存放memcache中,单独一个key的最大数量
public static final int HASH_MEMCACHE_SIZE = Integer.valueOf(prop.getProperty("HASH_MEMCACHE_SIZE"));
//判断重复的rate
public static final double HASH_SIMILARITY_RATE = Double.valueOf(prop.getProperty("HASH_SIMILARITY_RATE"));
//专题分析OrgId
public static final Long SUBJECT_ANALYSIS_ORGID = Long.parseLong(prop.getProperty("SUBJECT_ANALYSIS_ORGID"));
//专题分析带评论的信息源的域名字符串
public static final String SUBJECT_COMMENT_URL = prop.getProperty("SUBJECT_COMMENT_URL");
//关键词过滤内容读url
public static final String CONTENT_EXCLE_URL = prop.getProperty("CONTENT_EXCLE_URL");
//关键词过滤关键词读url
public static final String KEYWORDS_EXCLE_URL = prop.getProperty("KEYWORDS_EXCLE_URL");
//关键词过滤输出url
public static final String RESULT_EXCLE_URL = prop.getProperty("RESULT_EXCLE_URL");
//关键词过滤执行线程数
public static final String THREAD_NUM = prop.getProperty("THREAD_NUM");
//专题去重处理 历史数据缓存 天数
public static final Integer SUBJECT_MEMCACHED_DAYS = Integer.valueOf(prop.getProperty("SUBJECT_MEMCACHED_DAYS"));
//境外央企监测去重处理 历史数据缓存 天数
public static final Long JWYQJC_MEMCACHED_DAYS = Long.parseLong(prop.getProperty("JWYQJC_MEMCACHED_DAYS"));
//境外央企监测输入文件url
public static final String JWYQJC_INFILE_URL = prop.getProperty("JWYQJC_INFILE_URL");
//判断重复的rate
public static final Double TITLE_SIMILARITY_RATE = Double.valueOf(prop.getProperty("TITLE_SIMILARITY_RATE"));
public static final String MODEL_SCORE_URL = prop.getProperty("MODEL_SCORE_URL");
public static final Integer CACHE_UPDATE = Integer.valueOf(prop.getProperty("CACHE_UPDATE"));
//国资监管评价中心相关性过滤算法URl(XGBOOST)
public static final String RELEVANCE_GZJG_XGBOOST_URL = prop.getProperty("RELEVANCE_GZJG_XGBOOST_URL");
// public static final boolean CRAL_TYPE = Integer.valueOf(prop.getProperty("CRAW_TYPE")).equals(1);
public static final Integer PROXYFLAG = Integer.parseInt(prop.getProperty("PROXY"));
public static final long PROXYID = Long.parseLong(prop.getProperty("PROXYID"));
public static final String CHROMEDRIVE = prop.getProperty("CHROMEDRIVE");
public static final String CHROMEBIN = prop.getProperty("CHROMEBIN");
public static final String KAFKA_CONSUMER_SERVERS= prop.getProperty("KAFKA_CONSUMER_SERVERS");
public static final String KAFKA_CONSUMER_TOPIC= prop.getProperty("KAFKA_CONSUMER_TOPIC");
public static final String KAFKA_CONSUMER_GROUP_ID= prop.getProperty("KAFKA_CONSUMER_GROUP_ID");
public static final String KAFKA_CONSUMER_AUTO_OFFSET_RESET= prop.getProperty("KAFKA_CONSUMER_AUTO_OFFSET_RESET");
public static final String PROXY= prop.getProperty("PROXY");
public static final String KAFKA_PRODUCT_TOPIC= prop.getProperty("KAFKA_PRODUCT_TOPIC");
/*统计信息发送的topic*/
public static final String KAFKA_COLLECT_TOPIC= prop.getProperty("KAFKA_COLLECT_TOPIC");
public static final String SOURCEADDRESS="SOURCEADDRESS_";
public static final String META_SEARCH_URL= prop.getProperty("META_SEARCH_URL");
// redis
public static final String REDIS_LOCALHOST= prop.getProperty("redis.host");
public static final String REDIS_PORT= prop.getProperty("redis.port");
public static final String REDIS_PASS= prop.getProperty("redis.pass");
public static final String REDIS_TIMEOUT= prop.getProperty("redis.timeout");
public static final String REDIS_MAXIDLE= prop.getProperty("redis.maxIdle");
public static final String REDIS_MAXTOTAL= prop.getProperty("redis.maxTotal");
public static final String REDIS_MAXWAITMILLIS= prop.getProperty("redis.maxWaitMillis");
public static final String REDIS_TESTONBORROW= prop.getProperty("redis.testOnBorrow");
}
package com.zzsn.util;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ContentUtility {
// ---------------------------------------------------------------------
// Pre-compiled, case-insensitive regular expressions shared by the
// clean-up methods of this class. Names ending in "RP" match a closing
// tag; names ending in "RemoveP" match an element together with its
// content (lazily, up to the first matching closing tag).
// ---------------------------------------------------------------------
// A <div> whose opening tag contains "display:none", through the first </div>.
static Pattern divNoneP = Pattern.compile("(?s)<div[^>]*display:none[^>]*>.*?</div>", Pattern.CASE_INSENSITIVE);
// Attribute-less <div> / </div>, and the two spellings of a line break.
static Pattern divP = Pattern.compile("<div>", Pattern.CASE_INSENSITIVE);
static Pattern divRP = Pattern.compile("</div>", Pattern.CASE_INSENSITIVE);
static Pattern brP = Pattern.compile("<br />", Pattern.CASE_INSENSITIVE);
static Pattern br2P = Pattern.compile("<br>", Pattern.CASE_INSENSITIVE);
// The literal entity text "&nbsp;".
static Pattern spaceP = Pattern.compile("&nbsp;", Pattern.CASE_INSENSITIVE);
static Pattern strongP = Pattern.compile("<strong>", Pattern.CASE_INSENSITIVE);
static Pattern strongRP = Pattern.compile("</strong>", Pattern.CASE_INSENSITIVE);
static Pattern pP = Pattern.compile("<p>", Pattern.CASE_INSENSITIVE);
static Pattern pRP = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);
static Pattern centerP = Pattern.compile("<center[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern centerRP = Pattern.compile("</center>", Pattern.CASE_INSENSITIVE);
// Any opening tag; group 1 captures the tag name so attributes can be dropped via "<$1>".
static Pattern removeAttrP = Pattern.compile("<([a-zA-Z0-9]+)[^>]*>", Pattern.CASE_INSENSITIVE);
// A "<!--...>" opener through a "<!...-->" closer.
// NOTE(review): this shape looks like conditional-comment blocks such as
// <!--[if IE]> ... <![endif]-->; plain <!-- ... --> comments are handled in RemoveStyleCode - confirm intent.
static Pattern commentP = Pattern.compile("(?s)<!--[^>]*>.*?<![^>]*-->", Pattern.CASE_INSENSITIVE);
// Form-related markup.
static Pattern inputP = Pattern.compile("<input[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern formP = Pattern.compile("<form[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern formRP = Pattern.compile("</form>", Pattern.CASE_INSENSITIVE);
static Pattern buttonP = Pattern.compile("(?s)<button[^>]*>.*?</button>", Pattern.CASE_INSENSITIVE);
// Embedded / non-textual content.
static Pattern iframeP = Pattern.compile("(?s)<iframe[^>]*>.*?</iframe>", Pattern.CASE_INSENSITIVE);
static Pattern noscriptP = Pattern.compile("(?s)<noscript>.*?</noscript>", Pattern.CASE_INSENSITIVE);
static Pattern objectP = Pattern.compile("(?s)<object[^>]*>.*?</object>", Pattern.CASE_INSENSITIVE);
static Pattern linkP = Pattern.compile("(?s)<link[^>]*>", Pattern.CASE_INSENSITIVE);
// Image tags. imgReplaceP / imgRevReplaceP capture the attribute text so that <img ...> can be
// renamed to <_img ...> and back; within this class they are referenced only from commented-out code.
static Pattern imgReplaceP = Pattern.compile("<img([^>]*)>", Pattern.CASE_INSENSITIVE);
static Pattern imgRevReplaceP = Pattern.compile("<_img([^>]*)>", Pattern.CASE_INSENSITIVE);
static Pattern imgP = Pattern.compile("<img[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern imgRP = Pattern.compile("</img>", Pattern.CASE_INSENSITIVE);
// Whole anchor elements including their text (the only public pattern of the class).
public static Pattern aRemoveP = Pattern.compile("(?s)<a[^>]*>.*?</a>", Pattern.CASE_INSENSITIVE);
static Pattern legendRemoveP = Pattern.compile("(?s)<legend[^>]*>.*?</legend>", Pattern.CASE_INSENSITIVE);
// Anchor / font / heading tags only (content is kept).
static Pattern aP = Pattern.compile("<a[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern aRP = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);
static Pattern fontP = Pattern.compile("<font[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern fontRP = Pattern.compile("</font>", Pattern.CASE_INSENSITIVE);
static Pattern hP = Pattern.compile("<h\\d[^>]*>", Pattern.CASE_INSENSITIVE);
static Pattern hRP = Pattern.compile("</h\\d>", Pattern.CASE_INSENSITIVE);
// Closing tags of list and table structures.
static Pattern ulRP = Pattern.compile("</ul>", Pattern.CASE_INSENSITIVE);
static Pattern liRP = Pattern.compile("</li>", Pattern.CASE_INSENSITIVE);
static Pattern trRP = Pattern.compile("</tr>", Pattern.CASE_INSENSITIVE);
static Pattern tdRP = Pattern.compile("</td>", Pattern.CASE_INSENSITIVE);
// Form widgets removed together with their content.
static Pattern textareaRemoveP = Pattern.compile("(?s)<textarea[^>]*>.*?</textarea>", Pattern.CASE_INSENSITIVE);
static Pattern selectRemoveP = Pattern.compile("(?s)<select[^>]*>.*?</select>", Pattern.CASE_INSENSITIVE);
static Pattern optionRemoveP = Pattern.compile("(?s)<option[^>]*>.*?</option>", Pattern.CASE_INSENSITIVE);
static Pattern labelRemoveP = Pattern.compile("(?s)<label[^>]*>.*?</label>", Pattern.CASE_INSENSITIVE);
// Decimal character references with 4 or 5 digits, e.g. "&#20013;"; group 1 is the number.
static String regHTMLNumcode = "&#(\\d{4,5});";
static Pattern patHTMLNumCode = Pattern.compile(regHTMLNumcode);
/**
 * Normalizes an HTML string by parsing it with Jsoup and serializing the
 * resulting document again.
 * <p>
 * Despite its name, this method performs no character-set conversion: the
 * input is already a Java String, and {@code contentTypeHeader} is not
 * consulted. The parameter is kept so existing callers continue to compile.
 *
 * @param html              HTML source; may be {@code null}
 * @param contentTypeHeader unused
 * @return the HTML as re-serialized by Jsoup, or {@code null} when
 *         {@code html} is {@code null}
 */
public static String convertHtmlToUtf8(String html, Header contentTypeHeader)
{
    if (html == null) {
        // Nothing to normalize; avoid handing null to the parser.
        return null;
    }
    Document jsoupDoc = Jsoup.parse(html);
    return jsoupDoc.html();
}
/**
 * Strips markup that carries no article content: script/style blocks,
 * hidden divs, form widgets, comments, iframes, objects, centering tags,
 * cufon wrappers and a few kinds of empty containers. Tag attributes are
 * reduced via {@link #removeUselessAtt(String)}.
 * <p>
 * Processing is best-effort: if any step throws, the stack trace is printed
 * and the text as cleaned up to that point is returned.
 *
 * @param htmlText HTML fragment to clean
 * @return the cleaned HTML
 */
public static String RemoveUselessHTMLTagX(String htmlText) {
    String cleaned = htmlText;
    try {
        cleaned = RemoveStyleCode(cleaned);
        cleaned = cleaned.replaceAll("&nbsp;", " ");

        // Step 1: hidden blocks and form-related elements, in this exact order.
        Pattern[] beforeAttributeStripping = {
            divNoneP, textareaRemoveP, selectRemoveP, optionRemoveP, labelRemoveP,
            inputP, formP, buttonP, formRP
        };
        for (Pattern unwanted : beforeAttributeStripping) {
            cleaned = unwanted.matcher(cleaned).replaceAll("");
        }

        // Step 2: drop attributes from tags that do not need them.
        cleaned = removeUselessAtt(cleaned);

        // Step 3: comments, legends, embedded content and centering tags.
        Pattern[] afterAttributeStripping = {
            commentP, legendRemoveP, iframeP, noscriptP, objectP, centerP, centerRP
        };
        for (Pattern unwanted : afterAttributeStripping) {
            cleaned = unwanted.matcher(cleaned).replaceAll("");
        }

        // Step 4: cufon wrappers and containers left empty by the steps above.
        String[] leftoverRegexes = {
            "<cufontext>",
            "</cufontext>",
            "<cufon>",
            "</cufon>",
            "(?s)<ul[^>]*>\\s*</ul>",
            "(?s)<div[^>]*>\\s*</div>",
            "(?s)<p[^>]*>\\s*</p>",
            "(?s)<li[^>]*>\\s*</li>",
            "(?s)<canvas[^>]*>\\s*</canvas>"
        };
        for (String regex : leftoverRegexes) {
            cleaned = cleaned.replaceAll(regex, "");
        }
        return cleaned;
    } catch (Exception e) {
        e.printStackTrace();
        return cleaned;
    }
}
/**
 * Converts an HTML fragment into plain text by turning block-level
 * boundaries into blank lines and deleting every remaining tag.
 * <p>
 * Fix: the bullet reference {@code &#8226;} used to be replaced by the
 * literal {@code "??"}, the residue of a source-file encoding conversion;
 * it now becomes the bullet character U+2022.
 *
 * @param src HTML fragment; may be {@code null}
 * @return the trimmed plain text, or {@code null} when {@code src} is {@code null}
 */
public static String RemoveHTMLCode(String src) {
    if (src == null) {
        return null;
    }
    // Collapse whitespace between two adjacent tags so it does not survive as text.
    src = src.replaceAll("(<[^>]*>)\\s*(<[^>]*>)", "$1$2");

    // Block boundaries and line breaks become paragraph separators.
    src = divP.matcher(src).replaceAll("\n\n");
    src = divRP.matcher(src).replaceAll("\n\n");
    src = brP.matcher(src).replaceAll("\n\n");
    src = br2P.matcher(src).replaceAll("\n\n");
    src = spaceP.matcher(src).replaceAll(" ");
    // Written as an escape so the literal survives any source-file encoding.
    src = src.replaceAll("&#8226;", "\u2022");

    // Inline formatting and links are dropped, their text is kept.
    src = strongP.matcher(src).replaceAll("");
    src = strongRP.matcher(src).replaceAll("");
    src = pP.matcher(src).replaceAll("\n\n");
    src = pRP.matcher(src).replaceAll("\n\n");
    src = aP.matcher(src).replaceAll("");
    src = aRP.matcher(src).replaceAll("");
    src = imgP.matcher(src).replaceAll("");
    src = fontP.matcher(src).replaceAll("");
    src = fontRP.matcher(src).replaceAll("");

    // The end of a heading, list, list item, table row or cell also ends a paragraph.
    src = hRP.matcher(src).replaceAll("\n\n");
    src = ulRP.matcher(src).replaceAll("\n\n");
    src = liRP.matcher(src).replaceAll("\n\n");
    src = trRP.matcher(src).replaceAll("\n\n");
    src = tdRP.matcher(src).replaceAll("\n\n");

    // Whatever markup is left (including tags with attributes) is removed outright.
    src = src.replaceAll("<[^>]*>", "");
    return src.trim();
}
/**
 * Removes the attributes of every opening tag except those that start with
 * {@code <td }, {@code <th }, {@code <img } or {@code <a } (all-lowercase
 * or all-uppercase spelling), which are left untouched.
 * <p>
 * Each distinct tag text found in the input is mapped to its attribute-free
 * form and then replaced wherever that exact text occurs.
 *
 * @param htmlText HTML fragment to process
 * @return the fragment with attributes stripped from the affected tags
 * @version 1.0
 */
public static String removeUselessAtt(String htmlText) {
    // Distinct tag text -> the same tag reduced to "<name>".
    Map<String, String> bareFormByTag = new HashMap<String, String>();
    Matcher tagFinder = removeAttrP.matcher(htmlText);
    while (tagFinder.find()) {
        String tag = tagFinder.group();
        boolean keepsAttributes =
                tag.startsWith("<td ") || tag.startsWith("<TD ")
                || tag.startsWith("<th ") || tag.startsWith("<TH ")
                || tag.startsWith("<img ") || tag.startsWith("<IMG ")
                || tag.startsWith("<a ") || tag.startsWith("<A ");
        if (keepsAttributes) {
            continue;
        }
        bareFormByTag.put(tag, removeAttrP.matcher(tag).replaceAll("<$1>"));
    }

    for (Map.Entry<String, String> entry : bareFormByTag.entrySet()) {
        String tagWithAttributes = entry.getKey();
        String bareTag = entry.getValue();
        if (tagWithAttributes.equals(bareTag)) {
            // The tag had no attributes to begin with.
            continue;
        }
        while (htmlText.contains(tagWithAttributes)) {
            htmlText = htmlText.replace(tagWithAttributes, bareTag);
        }
    }
    return htmlText;
}
/**
 * A 4-5 digit decimal character reference. The terminating semicolon may be
 * missing only when another reference follows directly or the input ends,
 * which are the semicolon-less forms the previous implementation decoded.
 */
private static final Pattern NUMERIC_REFERENCE_P = Pattern.compile("&#(\\d{4,5})(?:;|(?=&#)|$)");

/**
 * Decodes a fixed set of named HTML entities and 4-5 digit decimal
 * character references, and removes {@code <?...>} processing instructions.
 * <p>
 * Fixes over the previous implementation:
 * <ul>
 * <li>{@code &raquo;}, {@code &ccedil;} and {@code &atilde;} decoded to the
 *     literal {@code "??"} (source-encoding damage); they now decode to the
 *     real characters.</li>
 * <li>A second pass split the text on {@code ;}, which deleted every
 *     semicolon and turned any bare 4-5 digit number standing between
 *     semicolons (e.g. {@code "2018;2019;"}) into a single character.
 *     Ordinary text is no longer touched.</li>
 * <li>References decoding to {@code $} or {@code \} made
 *     {@code replaceFirst} throw; replacements are now quoted.</li>
 * <li>Code points above U+FFFF were truncated by a {@code (char)} cast.</li>
 * </ul>
 * NOTE(review): {@code &amp;} is still decoded before {@code &gt;}/{@code &lt;}
 * and the numeric references, so doubly-escaped input such as
 * {@code &amp;lt;} ends up as {@code <}. Left unchanged because callers may
 * rely on it - confirm.
 *
 * @param str text containing HTML entities; may be {@code null}
 * @return the decoded text, or {@code null} when {@code str} is {@code null}
 */
public static String HTMLDecode(String str) {
    if (str == null) {
        return null;
    }

    // Named entities. Characters outside ASCII that were damaged before are
    // written as escapes so they survive any source-file encoding.
    str = str.replace("&quot;", "\"");
    str = str.replace("&nbsp;", " ");
    str = str.replace("&middot;", "\u00B7");
    str = str.replace("&amp;", "&");
    str = str.replace("&ldquo;", "\u201C");
    str = str.replace("&rdquo;", "\u201D");
    str = str.replace("&gt;", ">");
    str = str.replace("&lt;", "<");
    str = str.replace("&raquo;", "\u00BB");
    str = str.replace("&times;", "\u00D7");
    str = str.replace("&ccedil;", "\u00E7");
    str = str.replace("&atilde;", "\u00E3");
    str = str.replace("&ecirc;", "\u00EA");

    // Drop processing instructions such as <?xml ... ?>.
    str = str.replaceAll("<\\?[^>]*>", "");

    // Decimal character references, decoded in a single pass.
    Matcher reference = NUMERIC_REFERENCE_P.matcher(str);
    StringBuffer decoded = new StringBuffer(str.length());
    while (reference.find()) {
        int codePoint = Integer.parseInt(reference.group(1));
        String character = new String(Character.toChars(codePoint));
        // quoteReplacement keeps '$' and '\' from being read as group syntax.
        reference.appendReplacement(decoded, Matcher.quoteReplacement(character));
    }
    reference.appendTail(decoded);
    return decoded.toString();
}
/**
 * Deletes every carriage-return and line-feed character from the input.
 *
 * @param src text to flatten onto one line; must not be {@code null}
 * @return {@code src} without any {@code \r} or {@code \n}
 */
public static String RemoveHTMLReturnCode(String src) {
    return src.replace("\r", "").replace("\n", "");
}
/**
 * Extracts the plain text from an HTML string.
 * <p>
 * The input is passed, in this order, through RemoveHTMLReturnCode,
 * RemoveStyleCode, RemoveHTMLCode and HTMLDecode, and the result is then
 * post-processed with the whitespace replacements below.
 *
 * @param htmlText HTML source; may be null
 * @return the extracted text, or null when htmlText is null
 */
public static String TransferHTML2Text(String htmlText) {
if(htmlText==null){
return null;
}
String text = ContentUtility.HTMLDecode(ContentUtility.RemoveHTMLCode(ContentUtility.RemoveStyleCode(ContentUtility.RemoveHTMLReturnCode(htmlText))));
// A run of three whitespace characters is turned into a line break.
// NOTE(review): the characters inside this literal and the one in the last
// replacement may be non-breaking or other non-ASCII spaces in the original
// file; they cannot be told apart here - confirm before editing these lines.
text = text.replaceAll("   ", "\r\n");
// Drop spaces that directly precede a line break.
text = text.replaceAll(" +\r\n", "\r\n");
// Collapse runs of spaces into one.
text = text.replaceAll(" +", " ");
// Remove non-breaking (U+00A0) and ideographic (U+3000) spaces.
text = text.replaceAll("[\\u00A0\\u3000]", "");
text = text.replaceAll(" ", "");
return text;
}
/** A script element with its content, up to the first closing tag. */
private static final Pattern SCRIPT_BLOCK_P = Pattern.compile("(?s)<script\\s*.*?>(.*?)</script>", Pattern.CASE_INSENSITIVE);
/** A style element with its content, up to the first closing tag. */
private static final Pattern STYLE_BLOCK_P = Pattern.compile("(?s)<style\\s*.*?>(.*?)</style>", Pattern.CASE_INSENSITIVE);
/** A self-closed script tag; {@code [^>]*} keeps the match inside one tag. */
private static final Pattern SCRIPT_SELF_CLOSED_P = Pattern.compile("<script\\b[^>]*/>", Pattern.CASE_INSENSITIVE);
/** A self-closed style tag. */
private static final Pattern STYLE_SELF_CLOSED_P = Pattern.compile("<style\\b[^>]*/>", Pattern.CASE_INSENSITIVE);
/** A self-closed img tag. */
private static final Pattern IMG_SELF_CLOSED_P = Pattern.compile("<img\\b[^>]*/>", Pattern.CASE_INSENSITIVE);
/** A plain HTML comment. */
private static final Pattern HTML_COMMENT_P = Pattern.compile("(?s)<!--.*?-->");

/**
 * Removes script and style elements, noscript/object elements, link tags,
 * self-closed img tags and HTML comments from the given markup.
 * <p>
 * Fix: the self-closed-tag patterns used to be {@code (?s)<img\s*.*?/>}
 * (and the same for script/style). Because {@code .} matched across tag
 * boundaries, an {@code <img ...>} that was not self-closed caused
 * everything up to the next {@code />} anywhere later in the document
 * (for example a {@code <br />}) to be deleted, text included. The patterns
 * are now confined to a single tag. They are also compiled once instead of
 * on every call.
 *
 * @param content HTML source; may be {@code null}
 * @return the markup without the elements listed above, or {@code null}
 *         when {@code content} is {@code null}
 */
public static String RemoveStyleCode(String content) {
    if (content == null) {
        // Previously reached via a caught NullPointerException; same result.
        return null;
    }
    content = SCRIPT_BLOCK_P.matcher(content).replaceAll("");
    content = STYLE_BLOCK_P.matcher(content).replaceAll("");
    content = SCRIPT_SELF_CLOSED_P.matcher(content).replaceAll("");
    content = STYLE_SELF_CLOSED_P.matcher(content).replaceAll("");
    content = noscriptP.matcher(content).replaceAll("");
    content = objectP.matcher(content).replaceAll("");
    content = linkP.matcher(content).replaceAll("");
    content = IMG_SELF_CLOSED_P.matcher(content).replaceAll("");
    // Comments go last so that commented-out script/style was already handled above.
    content = HTML_COMMENT_P.matcher(content).replaceAll("");
    return content;
}
// /**
// * 获取类型
// * 创建人: 李东亮
// * 创建时间: 2016-9-14 下午2:44:21
// * @version 1.0
// * @param getMethod
// * @return
// */
// public static String getContentType(GetMethod getMethod){
// Header contentTypeHeader = getMethod.getResponseHeader("Content-Type");
// String contentType = "";
// if (contentTypeHeader!=null) {
// contentType = contentTypeHeader.getValue();
// }
//
// // TODO Auto-generated method stub
// if (contentType.startsWith("text/html")) {
// return Constants.TYPE_HTML;
// }else if(contentType.startsWith("image/")){
// return Constants.TYPE_IMG;
// }else
// if (contentType.startsWith("application/pdf")) {
// return Constants.TYPE_PDF;
// }else
// //word抽取,ppt抽取等。。。
// if (contentType.startsWith("application/msword")
// || contentType.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
// return Constants.TYPE_WORD;
// }else
// if (contentType.startsWith("application/vnd.ms-powerpoint")
// || contentType.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
// return Constants.TYPE_PPT;
// }else
// if (contentType.startsWith("application/vnd.ms-excel")
// || contentType.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
// return Constants.TYPE_EXCEL;
// }
// return Constants.TYPE_HTML;
//
// }
/**
 * Maps the {@code Content-Type} header of a response to one of the
 * {@code Constants.TYPE_*} document kinds.
 * <p>
 * Media types are compared case-insensitively and with surrounding
 * whitespace ignored (previously {@code Application/PDF} fell through to
 * HTML). A missing header, a header without a value, or an unrecognized
 * type yields {@code Constants.TYPE_HTML}.
 *
 * @param getMethod the HTTP response to inspect
 * @return {@code TYPE_HTML}, {@code TYPE_PDF}, {@code TYPE_WORD},
 *         {@code TYPE_PPT} or {@code TYPE_EXCEL}
 * @version 1.0
 */
public static String getContentType(HttpResponse getMethod){
    Header contentTypeHeader = getMethod.getFirstHeader("Content-Type");
    String contentType = "";
    if (contentTypeHeader != null && contentTypeHeader.getValue() != null) {
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        contentType = contentTypeHeader.getValue().trim().toLowerCase(java.util.Locale.ROOT);
    }

    if (contentType.startsWith("application/pdf")) {
        return Constants.TYPE_PDF;
    }
    if (contentType.startsWith("application/msword")
            || contentType.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
        return Constants.TYPE_WORD;
    }
    if (contentType.startsWith("application/vnd.ms-powerpoint")
            || contentType.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
        return Constants.TYPE_PPT;
    }
    if (contentType.startsWith("application/vnd.ms-excel")
            || contentType.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
        return Constants.TYPE_EXCEL;
    }
    // text/html and everything unrecognized are treated as HTML.
    return Constants.TYPE_HTML;
}
/**
 * Manual smoke test for {@link #TransferHTML2Text(String)}.
 * <p>
 * Every command-line argument is treated as an HTML fragment and its
 * extracted text is printed. Without arguments a short built-in sample is
 * converted instead; it replaces the full-length article that used to be
 * embedded here and still exercises links, line breaks and entities.
 *
 * @param args optional HTML fragments to convert
 */
public static void main(String[] args) {
    if (args != null && args.length > 0) {
        for (String html : args) {
            System.out.println(ContentUtility.TransferHTML2Text(html));
        }
        return;
    }
    String sample = "<div>\r\n"
            + "&nbsp;&nbsp;&nbsp;2019年4月1日起发布施行的新修订的《 \r\n"
            + " <a href=\"http://www.nyjx.cn\" target=\"_blank\">农业机械</a>试验鉴定办法》(以下简称“办法”)规定, \r\n"
            + " <a href=\"http://www.nyjx.cn\" target=\"_blank\">农机</a>鉴定分为推广鉴定和专项鉴定。 \r\n"
            + " <br> \r\n"
            + " <br>&nbsp;&nbsp;&nbsp;《农业农村部关于印发&lt; \r\n"
            + " <a href=\"http://www.nyjx.cn\" target=\"_blank\">农业机械</a>试验鉴定工作规范&gt;的通知》( \r\n"
            + " <a href=\"http://www.nyjx.cn\" target=\"_blank\">农机</a>发〔2019〕3号)有相关要求。 \r\n"
            + " <br> \r\n"
            + "</div>";
    System.out.println(ContentUtility.TransferHTML2Text(sample));
}
}
package com.zzsn.util;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
* 日期帮助类
* @author 李东亮
* @date 2014-7-16 上午10:03:38
* @company 郑州怡晟电子商务有限公司
*/
public class DateUtil
{
private static String defDtPtn = "yyyy-MM-dd HH:mm:ss";// 缺省日期格式
/**
 * Returns the first day of the month that contains the given date.
 *
 * @param date any instant within the month of interest
 * @return that month's first day formatted as {@code yyyy-MM-dd}
 */
public static String getFirstDayOfMonth(Date date){
    Calendar monthStart = Calendar.getInstance();
    monthStart.setTime(date);
    // Only the day-of-month is changed; the time of day is not printed anyway.
    monthStart.set(Calendar.DAY_OF_MONTH, 1);
    return new SimpleDateFormat("yyyy-MM-dd").format(monthStart.getTime());
}
/**
 * Formats an epoch-millisecond value given as text.
 *
 * @param string   milliseconds since the epoch, as a decimal string
 * @param withtime {@code true} for {@code yyyy-MM-dd HH:mm:ss},
 *                 {@code false} for {@code yyyy-MM-dd}
 * @return the formatted date, or {@code null} when {@code string} is
 *         {@code null}, empty, not a valid long, or formatting fails
 */
public static String tiemString2String(String string, boolean withtime)
{
    if (string == null || string.equals(""))
        return null;

    long epochMillis;
    try {
        epochMillis = Long.parseLong(string);
    } catch (NumberFormatException notANumber) {
        return null;
    }

    String pattern = withtime ? "yyyy-MM-dd HH:mm:ss" : "yyyy-MM-dd";
    try {
        return new SimpleDateFormat(pattern).format(new Date(epochMillis));
    } catch (Exception formattingFailed) {
        return null;
    }
}
/**
 * Formats a date with the given {@link SimpleDateFormat} pattern.
 *
 * @param value  the date to format; must not be {@code null}
 * @param format a {@code SimpleDateFormat} pattern
 * @return the formatted date
 */
public static String formatString(Date value, String format)
{
    return new SimpleDateFormat(format).format(value);
}
/**
 * Extracts the date of birth from an identity-card number.
 * <p>
 * An 18-character number carries the birth date as {@code yyyyMMdd} at
 * positions 6-13. A 15-character number carries {@code yyMMdd} at positions
 * 6-11, and the century {@code 19} is assumed.
 * <p>
 * Fix: a {@code null} argument or a number of any other length used to
 * throw (NullPointerException / StringIndexOutOfBoundsException); it now
 * yields {@code null}, like an unparseable date already did.
 *
 * @param IDStr identity-card number of 15 or 18 characters
 * @return the birth date at midnight in the default time zone, or
 *         {@code null} when it cannot be determined
 */
public static Date getBirthDay(String IDStr) {
    if (IDStr == null) {
        return null;
    }

    String normalized;
    if (IDStr.length() == 18) {
        normalized = IDStr.substring(0, 17);
    } else if (IDStr.length() == 15) {
        // Old format: two-digit year, expanded with the century "19".
        normalized = IDStr.substring(0, 6) + "19" + IDStr.substring(6, 15);
    } else {
        return null;
    }

    String strYear = normalized.substring(6, 10);   // year
    String strMonth = normalized.substring(10, 12); // month
    String strDay = normalized.substring(12, 14);   // day of month
    try {
        return new SimpleDateFormat("yyyy-MM-dd").parse(strYear + "-" + strMonth + "-" + strDay);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Normalizes a {@code yyyy-MM-dd} date string by parsing it and formatting
 * it again with the same pattern (e.g. {@code 2020-1-2} becomes
 * {@code 2020-01-02}).
 *
 * @param value a date in {@code yyyy-MM-dd} form
 * @return the date re-formatted as {@code yyyy-MM-dd}
 * @throws Exception if {@code value} cannot be parsed
 */
public static String format(String value) throws Exception
{
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    return dayFormat.format(dayFormat.parse(value));
}
/**
 * Counts calendar years from the year of the given date up to and including
 * the current year (current year - start year + 1).
 *
 * @param strDate start date in {@code yyyy/MM/dd} form
 * @return the inclusive number of years, or {@code 0} when the date cannot
 *         be parsed (the stack trace is printed)
 */
public static int getGL(String strDate){
    try {
        Date startDate = new SimpleDateFormat("yyyy/MM/dd").parse(strDate);
        Calendar start = Calendar.getInstance();
        start.setTime(startDate);
        int thisYear = Calendar.getInstance().get(Calendar.YEAR);
        return thisYear - start.get(Calendar.YEAR) + 1;
    } catch (Exception e) {
        e.printStackTrace();
        return 0;
    }
}
/**
 * Returns the first day of the month following the given date.
 *
 * @param date any instant within the current month of interest
 * @return the first day of the next month formatted as {@code yyyy-MM-dd}
 */
public static String getNextMonthFirst(Date date){
    Calendar nextMonth = Calendar.getInstance();
    nextMonth.setTime(date);
    // Move forward one month first, then pin the day to the 1st.
    nextMonth.add(Calendar.MONTH, 1);
    nextMonth.set(Calendar.DAY_OF_MONTH, 1);
    return new SimpleDateFormat("yyyy-MM-dd").format(nextMonth.getTime());
}
/**
 * Compares two dates given in {@code yyyy-MM-dd} form.
 *
 * @param time1 first date
 * @param time2 second date
 * @return a negative value, zero or a positive value as {@code time1} is
 *         before, equal to or after {@code time2}; also {@code 0} when
 *         either argument cannot be parsed (the stack trace is printed)
 */
public static int compareTime(String time1, String time2)
{
    try
    {
        DateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
        Date first = dayFormat.parse(time1);
        Date second = dayFormat.parse(time2);
        return first.compareTo(second);
    } catch (Exception e)
    {
        e.printStackTrace();
        return 0;
    }
}
/**
 * Converts a date string from one {@link SimpleDateFormat} pattern to
 * another.
 * <p>
 * Fix: a {@code null} or empty {@code time}, and a {@code time} that does
 * not match {@code from_format}, used to end in a NullPointerException at
 * the final format call. These cases now return the empty string, which is
 * also what {@code format(Date, String)} returns for a missing date.
 *
 * @param time        the date text to convert
 * @param from_format pattern {@code time} is written in
 * @param to_format   pattern of the result
 * @return {@code time} re-formatted with {@code to_format}, or {@code ""}
 *         when there is nothing to convert or parsing fails (the stack
 *         trace is printed on a parse failure)
 */
public static String format(String time, String from_format, String to_format)
{
    if (time == null || "".equals(time))
    {
        return "";
    }
    Date date;
    try
    {
        date = new SimpleDateFormat(from_format).parse(time);
    } catch (ParseException e)
    {
        e.printStackTrace();
        return "";
    }
    return new SimpleDateFormat(to_format).format(date);
}
/**
 * Returns the current year.
 *
 * @return the current year formatted with the pattern {@code yyyy}
 */
public static String getCurrentYear(){
    Date now = new Date();
    return DateUtil.format(now, "yyyy");
}
/**
 * Returns the current month.
 *
 * @return the 1-based month number (1-12) as a string, without zero padding
 */
public static String getCurrentMonth(){
    // Calendar.MONTH is 0-based, hence the +1.
    int oneBasedMonth = Calendar.getInstance().get(Calendar.MONTH) + 1;
    return Integer.toString(oneBasedMonth);
}
/**
 * Returns the current year and month.
 *
 * @return text of the form {@code yyyy-M}; the month is 1-based and not zero padded
 */
public static String getCurrentYearMonth(){
    // Calendar.MONTH is 0-based, hence the +1.
    int oneBasedMonth = Calendar.getInstance().get(Calendar.MONTH) + 1;
    String yearText = DateUtil.format(new Date(), "yyyy");
    return yearText + "-" + oneBasedMonth;
}
/**
 * Returns the current date and time.
 *
 * @return the current instant formatted as {@code yyyy-MM-dd HH:mm:ss}
 */
public static String getCreateDate(){
    return DateUtil.format(new Date(), "yyyy-MM-dd HH:mm:ss");
}
/**
 * Formats a date as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * @param d date to format
 * @return the formatted text, as produced by {@code DateUtil.format(Date, String)}
 */
public static String format(Date d )
{
    return DateUtil.format(d, "yyyy-MM-dd HH:mm:ss");
}
/**
 * Formats a date with the given pattern.
 *
 * @param d      date to format; may be null
 * @param format {@link SimpleDateFormat} pattern
 * @return the formatted text, or an empty string when {@code d} is null
 */
public static String format(Date d, String format)
{
    if (d != null)
    {
        return new SimpleDateFormat(format).format(d);
    }
    return "";
}
/**
 * Formats a date as {@code yyyy-MM-dd}.
 *
 * @param value date to format
 * @return the formatted date text
 */
public static String formatString(Date value)
{
    return new SimpleDateFormat("yyyy-MM-dd").format(value);
}
/**
 * Formats the given date as a plain calendar date.
 *
 * @param date
 *            the date to format
 * @return the date in {@code yyyy-mm-dd} form (for example 2008-11-22)
 */
public static String formatDate(Date date)
{
    // Date in this file is java.util.Date, whose toString() yields the long
    // "Sat Nov 22 00:00:00 ..." form. java.sql.Date#toString yields the
    // documented yyyy-mm-dd form.
    return new java.sql.Date(date.getTime()).toString();
}
/**
 * Returns the single-character Chinese weekday name for a date string.
 *
 * @param dateStr date text, parsed by {@code getTimeOfDateStr}
 * @return the weekday character (Sunday first), or null when the date cannot be parsed
 */
public final static String getDayOfWeekCn1(String dateStr) {
    Timestamp parsed = getTimeOfDateStr(dateStr);
    if (parsed == null) {
        return null;
    }
    Calendar cal = Calendar.getInstance();
    cal.setTime(parsed);
    // Indexed by Calendar.DAY_OF_WEEK - 1 (1 = Sunday ... 7 = Saturday).
    final String[] weekdayNames = { "日", "一", "二", "三", "四", "五", "六" };
    int day = cal.get(Calendar.DAY_OF_WEEK);
    if (day < 1 || day > 7) {
        return "";
    }
    return weekdayNames[day - 1];
}
/**
 * Parses a {@code yyyy-MM-dd} date string into a {@link Timestamp}.
 *
 * @param dateStr date text
 * @return the parsed timestamp, or null when parsing fails
 */
public final static Timestamp getTimeOfDateStr(String dateStr) {
    try {
        DateFormat dayPattern = new SimpleDateFormat("yyyy-MM-dd");
        return new Timestamp(dayPattern.parse(dateStr).getTime());
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Returns the day of week of a date.
 *
 * @param dateStr the date to inspect
 * @return the {@link Calendar#DAY_OF_WEEK} value (1 = Sunday ... 7 = Saturday)
 */
public final static int getDayOfWeek(Date dateStr)
{
    Calendar cal = Calendar.getInstance();
    cal.setTime(dateStr);
    return cal.get(Calendar.DAY_OF_WEEK);
}
/**
 * Parses a string into a date using the given pattern.
 *
 * @since 1.1
 * @param dateStr
 *            the date text to convert
 * @param pattern
 *            the {@link SimpleDateFormat} pattern {@code dateStr} is written in
 * @return the parsed date, or null when parsing fails
 */
public static Date parseDate(String dateStr, String pattern) {
    // The formatter starts from the class default pattern and is then
    // switched to the caller's pattern, as before.
    SimpleDateFormat parser = new SimpleDateFormat(defDtPtn);
    parser.applyPattern(pattern);
    try {
        return parser.parse(dateStr);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
/**
 * Formats the date part of {@code d} using a locale-specific style.
 *
 * @param d        date to format; may be null
 * @param format   {@link DateFormat} style constant (e.g. {@link DateFormat#SHORT})
 * @param locale   locale that selects the style's pattern
 * @param timeZone time zone used to render the date
 * @return the formatted text, or an empty string when {@code d} is null
 */
public static String formatDate(Date d, int format, Locale locale, TimeZone timeZone)
{
    if (d != null)
    {
        DateFormat styled = DateFormat.getDateInstance(format, locale);
        styled.setTimeZone(timeZone);
        return styled.format(d);
    }
    return "";
}
/**
 * Formats the date and time of {@code d} using locale-specific styles.
 *
 * @param d        date to format; may be null
 * @param format1  {@link DateFormat} style constant for the date part
 * @param format2  {@link DateFormat} style constant for the time part
 * @param locale   locale that selects the styles' patterns
 * @param timeZone time zone used to render the value
 * @return the formatted text, or an empty string when {@code d} is null
 */
public static String formatDateTime(Date d, int format1, int format2, Locale locale, TimeZone timeZone)
{
    if (d != null)
    {
        DateFormat styled = DateFormat.getDateTimeInstance(format1, format2, locale);
        styled.setTimeZone(timeZone);
        return styled.format(d);
    }
    return "";
}
/**
 * Parses a string into a date using the given pattern.
 *
 * @param dateStr
 *            date text
 * @param format
 *            {@link SimpleDateFormat} pattern {@code dateStr} is written in
 * @return the parsed date
 * @throws ParseException
 *             if {@code dateStr} does not match {@code format}
 */
public static Date convertDate(String dateStr, String format) throws ParseException
{
    return new SimpleDateFormat(format).parse(dateStr);
}
/**
 * Converts a date to epoch milliseconds.
 *
 * @param d date to convert; may be null
 * @return {@code d.getTime()}, or the fixed value 5338979352082120704 when {@code d} is null
 */
public static long toLong(Date d)
{
    // NOTE(review): this null sentinel looks like a decompiler artifact
    // (0L was presumably intended) - confirm with callers before changing.
    return (d == null) ? 5338979352082120704L : d.getTime();
}
/**
 * Returns the value of {@code toLong(d)} as decimal text.
 *
 * @param d date to convert; may be null
 * @return the string form of {@code toLong(d)}
 */
public static String toLongString(Date d)
{
    return String.valueOf(toLong(d));
}
/**
 * Extracts the calendar year of a date.
 *
 * @param date date to inspect; may be null
 * @return the year, or 0 when {@code date} is null
 */
public static int getYear(Date date)
{
    if (date != null)
    {
        Calendar cal = Calendar.getInstance();
        cal.setTime(date);
        return cal.get(Calendar.YEAR);
    }
    return 0;
}
/**
 * Builds a date from epoch milliseconds given as text.
 *
 * @param timeMillis decimal epoch milliseconds; surrounding whitespace is ignored
 * @return the date, or null when the text is null or not a valid long
 */
public static Date parse(String timeMillis)
{
    try
    {
        long millis = Long.parseLong(timeMillis.trim());
        return new Date(millis);
    } catch (Exception e)
    {
        // Deliberately best-effort: any failure yields null.
        return null;
    }
}
/**
 * Parses a date string with the given pattern using {@link Locale#CHINA}.
 *
 * @param time   date text
 * @param format {@link java.text.SimpleDateFormat} pattern
 * @return the parsed date, or null when parsing fails for any reason
 */
public static Date parse(String time, String format)
{
    try
    {
        return parse(time, format, Locale.CHINA);
    } catch (Exception e)
    {
        // Deliberately best-effort: any failure yields null.
        return null;
    }
}
/**
 * Parses a date string with the given pattern and locale.
 *
 * @param time   date text; may be null
 * @param format {@link SimpleDateFormat} pattern
 * @param locale locale used for locale-sensitive fields
 * @return the parsed date, or null when {@code time} is null
 * @throws Exception if {@code time} does not match {@code format}
 */
public static Date parse(String time, String format, Locale locale) throws Exception
{
    if (time != null)
    {
        return new SimpleDateFormat(format, locale).parse(time);
    }
    return null;
}
/**
 * Formats the instant held by a calendar with the given pattern.
 *
 * @param cal    calendar to format; may be null
 * @param format {@link SimpleDateFormat} pattern
 * @return the formatted text, or an empty string when {@code cal} is null
 */
public static String format(Calendar cal, String format)
{
    if (cal != null)
    {
        return new SimpleDateFormat(format).format(cal.getTime());
    }
    return "";
}
/**
 * Adds a number of days to a date.
 *
 * @param d   start date; may be null
 * @param day days to add (negative to subtract)
 * @return a calendar set to the shifted instant, or null when {@code d} is null
 */
public static Calendar add(Date d, int day)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.DATE, day);
        return shifted;
    }
    return null;
}
/**
 * Adds a number of days to a date.
 *
 * @param d   start date; may be null
 * @param day days to add (negative to subtract)
 * @return the shifted date, or null when {@code d} is null
 */
public static Date addDate(Date d, int day)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.DATE, day);
        return shifted.getTime();
    }
    return null;
}
/**
 * Adds a number of months to a date.
 *
 * @param d start date; may be null
 * @param m months to add (negative to subtract)
 * @return the shifted date, or null when {@code d} is null
 */
public static Date addMonth(Date d, int m)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.MONTH, m);
        return shifted.getTime();
    }
    return null;
}
/**
 * Adds a number of years to a date.
 *
 * @param d start date; may be null
 * @param m years to add (negative to subtract)
 * @return the shifted date, or null when {@code d} is null
 */
public static Date addYear(Date d, int m)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.YEAR, m);
        return shifted.getTime();
    }
    return null;
}
/**
 * Adds a number of hours to a date.
 *
 * @param d start date; may be null
 * @param h hours to add (negative to subtract)
 * @return the shifted date, or null when {@code d} is null
 */
public static Date addHourDate(Date d, int h)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.HOUR, h);
        return shifted.getTime();
    }
    return null;
}
/**
 * Adds a number of hours to a date.
 *
 * @param d start date; may be null
 * @param h hours to add (negative to subtract)
 * @return a calendar set to the shifted instant, or null when {@code d} is null
 */
public static Calendar addHour(Date d, int h)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.HOUR, h);
        return shifted;
    }
    return null;
}
/**
 * Adds a number of minutes to a date.
 *
 * @param d start date; may be null
 * @param m minutes to add (negative to subtract)
 * @return the shifted date, or null when {@code d} is null
 */
public static Date addMinuteDate(Date d, int m)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.MINUTE, m);
        return shifted.getTime();
    }
    return null;
}
/**
 * Adds a number of minutes to a date.
 *
 * @param d start date; may be null
 * @param m minutes to add (negative to subtract)
 * @return a calendar set to the shifted instant, or null when {@code d} is null
 */
public static Calendar addMinute(Date d, int m)
{
    if (d != null)
    {
        Calendar shifted = Calendar.getInstance();
        shifted.setTime(d);
        shifted.add(Calendar.MINUTE, m);
        return shifted;
    }
    return null;
}
/**
 * Compares two calendars by instant.
 *
 * @param c1 first calendar; may be null
 * @param c2 second calendar; may be null
 * @return 1 when {@code c1} is after {@code c2}, 0 when they are equal,
 *         2 when {@code c1} is before {@code c2}, and -1 when either is null
 */
public static int compare(Calendar c1, Calendar c2)
{
    if ((c1 == null) || (c2 == null))
        return -1;
    // The difference was previously compared against 5338979730039242752L
    // (roughly 169 million years in milliseconds), which made the "after"
    // and "equal" results unreachable for real dates. Zero is the pivot.
    long r = c1.getTimeInMillis() - c2.getTimeInMillis();
    if (r > 0L)
        return 1;
    if (r == 0L)
        return 0;
    return 2;
}
/**
 * Compares two dates by instant.
 *
 * @param c1 first date; may be null
 * @param c2 second date; may be null
 * @return 1 when {@code c1} is after {@code c2}, 0 when they are equal,
 *         2 when {@code c1} is before {@code c2}, and -1 when either is null
 */
public static int compare(Date c1, Date c2)
{
    if ((c1 == null) || (c2 == null))
        return -1;
    // The difference was previously compared against 5338979730039242752L
    // (roughly 169 million years in milliseconds), which made the "after"
    // and "equal" results unreachable for real dates. Zero is the pivot.
    long r = c1.getTime() - c2.getTime();
    if (r > 0L)
        return 1;
    if (r == 0L)
        return 0;
    return 2;
}
/**
 * Tells whether two calendars fall on the same calendar day.
 *
 * @param c1 first calendar; may be null
 * @param c2 second calendar; may be null
 * @return true when year, month and day-of-month all match; false otherwise
 *         or when either argument is null
 */
public static boolean isSameDay(Calendar c1, Calendar c2)
{
    if ((c1 == null) || (c2 == null))
        return false;
    boolean sameYear = c1.get(Calendar.YEAR) == c2.get(Calendar.YEAR);
    boolean sameMonth = c1.get(Calendar.MONTH) == c2.get(Calendar.MONTH);
    boolean sameDate = c1.get(Calendar.DATE) == c2.get(Calendar.DATE);
    return sameYear && sameMonth && sameDate;
}
/**
 * Tells whether two dates fall on the same calendar day in the default time zone.
 *
 * @param d1 first date; may be null
 * @param d2 second date; may be null
 * @return true when year, month and day-of-month all match; false otherwise
 *         or when either argument is null
 */
public static boolean isSameDay(Date d1, Date d2)
{
    if ((d1 == null) || (d2 == null))
        return false;
    Calendar first = Calendar.getInstance();
    Calendar second = Calendar.getInstance();
    first.setTime(d1);
    second.setTime(d2);
    if (first.get(Calendar.YEAR) != second.get(Calendar.YEAR))
        return false;
    if (first.get(Calendar.MONTH) != second.get(Calendar.MONTH))
        return false;
    return first.get(Calendar.DATE) == second.get(Calendar.DATE);
}
/**
 * Returns the number of whole 24-hour periods between two calendars ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in days, truncated toward zero, or -1 when either argument is null
 */
public static int datediff(Calendar c1, Calendar c2)
{
    if ((c1 == null) || (c2 == null))
        return -1;
    long elapsedMillis = c1.getTimeInMillis() - c2.getTimeInMillis();
    return (int) (elapsedMillis / 86400000L);
}
/**
 * Returns the number of whole 24-hour periods between two dates ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in days, truncated toward zero, or -1 when either argument is null
 */
public static int datediff(Date c1, Date c2)
{
    if ((c1 == null) || (c2 == null))
        return -1;
    long elapsedMillis = c1.getTime() - c2.getTime();
    return (int) (elapsedMillis / 86400000L);
}
/**
 * Returns the number of whole minutes between two dates ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in minutes, truncated toward zero, or 0 when either argument is null
 */
public static int datediffMinute(Date c1, Date c2)
{
    if ((c1 == null) || (c2 == null))
        return 0;
    long elapsedMillis = c1.getTime() - c2.getTime();
    // Floating-point division is kept so the int cast behaves as before.
    return (int) (elapsedMillis / 60000.0D);
}
/**
 * Returns the number of whole minutes between two calendars ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in minutes, truncated toward zero, or 0 when either argument is null
 */
public static int datediffMinute(Calendar c1, Calendar c2)
{
    if ((c1 == null) || (c2 == null))
        return 0;
    long elapsedMillis = c1.getTimeInMillis() - c2.getTimeInMillis();
    // Floating-point division is kept so the int cast behaves as before.
    return (int) (elapsedMillis / 60000.0D);
}
/**
 * Returns the number of whole hours between two dates ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in hours, truncated toward zero, or 0 when either argument is null
 */
public static int datediffHour(Date c1, Date c2)
{
    if ((c1 == null) || (c2 == null))
        return 0;
    long elapsedMillis = c1.getTime() - c2.getTime();
    // Floating-point division is kept so the int cast behaves as before.
    return (int) (elapsedMillis / 3600000.0D);
}
/**
 * Returns the number of whole hours between two calendars ({@code c1 - c2}).
 *
 * @param c1 minuend; may be null
 * @param c2 subtrahend; may be null
 * @return the difference in hours, truncated toward zero, or 0 when either argument is null
 */
public static int datediffHour(Calendar c1, Calendar c2)
{
    if ((c1 == null) || (c2 == null))
        return 0;
    long elapsedMillis = c1.getTimeInMillis() - c2.getTimeInMillis();
    // Floating-point division is kept so the int cast behaves as before.
    return (int) (elapsedMillis / 3600000.0D);
}
/**
 * Breaks the difference {@code d1 - d2} into days, hours, minutes and seconds.
 * The day count comes from {@code datediff(d1, d2)}; the remaining parts are the
 * differences of the wall-clock hour/minute/second fields with borrowing.
 *
 * @param d1 later date
 * @param d2 earlier date
 * @return a four-element array {@code {days, hours, minutes, seconds}}
 */
@SuppressWarnings("deprecation")
public static int[] dateDiffDHMS(Date d1, Date d2)
{
    int days = datediff(d1, d2);
    int hours = d1.getHours() - d2.getHours();
    int minutes = d1.getMinutes() - d2.getMinutes();
    int seconds = d1.getSeconds() - d2.getSeconds();
    // Borrow from the next larger unit whenever a part went negative.
    if (seconds < 0)
    {
        seconds += 60;
        minutes -= 1;
    }
    if (minutes < 0)
    {
        minutes += 60;
        hours -= 1;
    }
    if (hours < 0)
    {
        hours += 24;
    }
    return new int[] { days, hours, minutes, seconds };
}
/**
 * Returns the number of days in a month.
 *
 * @param year  Gregorian year, used only to decide February's length
 * @param month zero-based month index (0 = January ... 11 = December)
 * @return the number of days in that month
 */
public static int getDayCount(int year, int month)
{
    if (month == 1)
    {
        boolean leap = ((year % 4 == 0) && (year % 100 != 0)) || (year % 400 == 0);
        return leap ? 29 : 28;
    }
    final int[] monthLengths = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
    return monthLengths[month];
}
/**
 * Returns the number of days in a year.
 *
 * @param year year to inspect
 * @return 366 when {@link GregorianCalendar#isLeapYear(int)} reports a leap year, otherwise 365
 */
public static int getDaysOfYear(int year)
{
    GregorianCalendar calendar = new GregorianCalendar();
    if (calendar.isLeapYear(year))
    {
        return 366;
    }
    return 365;
}
/**
 * Formats a timestamp as text.
 *
 * @param ts       timestamp to format; may be null
 * @param withtime true for {@code yyyy-MM-dd HH:mm:ss}, false for {@code yyyy-MM-dd}
 * @return the formatted text, or an empty string when {@code ts} is null
 */
public static String Timestamp2String(Timestamp ts, boolean withtime) //
{
    if (ts == null)
        return "";
    String pattern = withtime ? "yyyy-MM-dd HH:mm:ss" : "yyyy-MM-dd";
    return new SimpleDateFormat(pattern).format(ts);
}
/**
 * Parses text into a timestamp.
 *
 * @param string   date text; may be null or empty
 * @param withtime true to parse as {@code yyyy-MM-dd HH:mm:ss}, false for {@code yyyy-MM-dd}
 * @return the parsed timestamp, or null when the text is null, empty or unparseable
 */
public static Timestamp String2Timestamp(String string, boolean withtime)
{
    if (string == null || string.equals(""))
        return null;
    String pattern = withtime ? "yyyy-MM-dd HH:mm:ss" : "yyyy-MM-dd";
    try
    {
        return new Timestamp(new SimpleDateFormat(pattern).parse(string).getTime());
    } catch (Exception e)
    {
        // Deliberately best-effort: unparseable text yields null.
        return null;
    }
}
/**
 * Formats an epoch time given in seconds.
 *
 * @param format {@link SimpleDateFormat} pattern of the result
 * @param l_time epoch time in seconds (multiplied by 1000 before formatting)
 * @return the formatted text
 */
public static String longToString(String format, long l_time)
{
    Date instant = new Date(l_time * 1000);
    return new SimpleDateFormat(format).format(instant);
}
/**
 * Adds a number of months to a year-month string.
 *
 * @param ym    year-month text in {@code yyyy-MM} form
 * @param count months to add (negative to subtract)
 * @return the shifted year-month as {@code yyyy-MM}, or null when {@code ym} cannot be parsed
 */
public static String addMonth(String ym, int count)
{
    SimpleDateFormat monthPattern = new SimpleDateFormat("yyyy-MM");
    Date start;
    try
    {
        start = monthPattern.parse(ym);
    } catch (ParseException e)
    {
        e.printStackTrace();
        return null;
    }
    Calendar shifted = Calendar.getInstance();
    shifted.setTime(start);
    shifted.add(Calendar.MONTH, count);
    return monthPattern.format(shifted.getTime());
}
/**
 * Converts an Excel serial day number (as read during an Excel import) to a date.
 * Only the whole-day part of the serial is used; the result is midnight of that
 * day in the default time zone.
 *
 * @param date Excel serial number as text, e.g. {@code "44197"} or {@code "44197.75"}
 * @return the corresponding date, counted from the base date 1899-12-30
 * @throws NumberFormatException if {@code date} is not numeric
 */
public static Date formatExcelDate(String date)
{
    Calendar cld = Calendar.getInstance();
    // Parse as double: single-precision rounding turned late-day serials such
    // as 44197.999 into 44198.0 and shifted the result by one day.
    int wholeDays = (int) Double.parseDouble(date);
    // Clear first so the result does not inherit the current time of day.
    cld.clear();
    cld.set(1899, Calendar.DECEMBER, 30);
    cld.add(Calendar.DAY_OF_YEAR, wholeDays);
    return cld.getTime();
}
/**
 * Returns the first day of the current month.
 *
 * @return that day formatted as {@code yyyy-MM-dd}
 */
public static String firstDay()
{
    Calendar today = Calendar.getInstance();
    // Day 1 of the month we are already in.
    today.set(Calendar.DAY_OF_MONTH, 1);
    return new SimpleDateFormat("yyyy-MM-dd").format(today.getTime());
}
/**
 * Returns the last day of the current month.
 *
 * @return that day formatted as {@code yyyy-MM-dd}
 */
public static String lastDay()
{
    Calendar today = Calendar.getInstance();
    int lastDayOfMonth = today.getActualMaximum(Calendar.DAY_OF_MONTH);
    today.set(Calendar.DAY_OF_MONTH, lastDayOfMonth);
    return new SimpleDateFormat("yyyy-MM-dd").format(today.getTime());
}
/**
 * Returns the Chinese weekday name of a date.
 *
 * @param date the date to inspect
 * @return the weekday name, Sunday through Saturday
 */
public static String getWeekDayString(Date date)
{
    // Indexed by Calendar.DAY_OF_WEEK - 1 (1 = Sunday ... 7 = Saturday).
    final String[] weekdayNames =
    { "星期日", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六" };
    Calendar cal = Calendar.getInstance();
    cal.setTime(date);
    return weekdayNames[cal.get(Calendar.DAY_OF_WEEK) - 1];
}
/**
 * Converts a date string from one pattern to another.
 *
 * @param source       date text to convert
 * @param sourceFormat {@link SimpleDateFormat} pattern {@code source} is written in
 * @param targetFormat {@link SimpleDateFormat} pattern of the result
 * @return the date rendered with {@code targetFormat}
 * @throws ParseException if {@code source} does not match {@code sourceFormat}
 */
public static String convertDate(String source,String sourceFormat,String targetFormat) throws ParseException{
    Date parsed = new SimpleDateFormat(sourceFormat).parse(source);
    return new SimpleDateFormat(targetFormat).format(parsed);
}
/* *//**
* 获取标准的本地时间
* 创建人: 李东亮
* 创建时间: 2015-7-2 上午10:32:25
* @version 1.0
* @param raw
* @return
*//*
public static String getPublishDate(String raw){
if(raw==null){
return null;
}
Date date = PublishDateUtil.transDate(raw);
if(date!=null){
Calendar c = Calendar.getInstance(TimeZone.getTimeZone("Asia/Shanghai"));
c.setTime(date);
Pattern p = Pattern.compile("(\\d{1,2})[:|:](\\d{1,2})([:|:]\\d{1,2}){0,1}");
Matcher m = p.matcher(raw);
while(m.find()){
String hour = m.group(1);
if(hour!=null){
c.set(Calendar.HOUR_OF_DAY, Integer.valueOf(hour));
}
String minute = m.group(2);
if(minute!=null){
c.set(Calendar.MINUTE, Integer.valueOf(minute));
}
String second = m.group(3);
if(second!=null){
c.set(Calendar.SECOND, Integer.valueOf(second.replaceAll(":|:", "")));
}
}
Date now = new Date();
if(c.getTimeInMillis()>now.getTime()){
c.setTime(now);
}
return DateUtil.format(c.getTime(),"yyyy-MM-dd HH:mm:ss");
}else
{
return null;
}
}*/
/**
 * Converts US-style dates such as "JANUARY 15TH, 2016", "January 1st, 2016"
 * or "January 15, 2016" to {@code yyyy-MM-dd}.
 * Author: 李东亮
 * Created: 2016-5-12 10:52:09
 * @version 1.0
 * @param dateStr date text: English month name, day with an optional ordinal
 *                suffix (st/nd/rd/th, any letter case), a comma, and the year
 * @return the date as {@code yyyy-MM-dd}, or null when the text cannot be parsed
 */
public static String formatUSDate(String dateStr){
    // Drop the ordinal suffix that follows the day number. The previous
    // quoted-literal patterns matched only lower-case suffixes, so the
    // documented "JANUARY 15TH, 2016" form was rejected.
    String normalized = dateStr.replaceFirst("(?i)(\\d)(st|nd|rd|th)", "$1");
    SimpleDateFormat usPattern = new SimpleDateFormat("MMMM d, yyyy",Locale.ENGLISH);
    Date date;
    try {
        date = usPattern.parse(normalized);
    } catch (ParseException e) {
        e.printStackTrace();
        return null;
    }
    return new SimpleDateFormat("yyyy-MM-dd").format(date);
}
/**
 * Formats a date with a standard pattern whose precision matches the precision
 * of a source pattern.
 * Author: 李东亮
 * Created: 2016-8-15 15:09:14
 * @version 1.0
 * @param sourceFormat the pattern the date originally came in; its finest field
 *                     ("mm", "HH", "dd", otherwise month) selects the output precision
 * @param date         the date to format
 * @return the date rendered as {@code yyyy-MM-dd HH:mm}, {@code yyyy-MM-dd HH},
 *         {@code yyyy-MM-dd} or {@code yyyy-MM}
 */
public static String convertStandardFormat(String sourceFormat,Date date){
    // Test the finest field first. Checking "dd" first made the hour and
    // minute branches unreachable for any pattern that also contains a day.
    String standFormat = "yyyy-MM";
    if(sourceFormat.contains("mm")){
        standFormat = "yyyy-MM-dd HH:mm";
    }else if(sourceFormat.contains("HH")){
        standFormat = "yyyy-MM-dd HH";
    }else if(sourceFormat.contains("dd")){
        standFormat = "yyyy-MM-dd";
    }
    SimpleDateFormat standardSdf = new SimpleDateFormat(standFormat);
    return standardSdf.format(date);
}
/**
 * Returns the instant a number of days before the given date.
 *
 * @param day  reference date
 * @param days days to go back
 * @return the earlier instant formatted as {@code yyyy-MM-dd HH:mm:ss}
 */
public static String getDateBeforeDays(Date day, int days) {
    Calendar cursor = Calendar.getInstance();
    cursor.setTime(day);
    // An out-of-range day-of-month is normalized into the neighbouring month.
    int shiftedDayOfMonth = cursor.get(Calendar.DATE) - days;
    cursor.set(Calendar.DATE, shiftedDayOfMonth);
    return format(cursor.getTime(), "yyyy-MM-dd HH:mm:ss");
}
/**
 * Builds a numeric id prefix from today's date.
 *
 * @return today's date as {@code yyMMdd} followed by eight zeros, as a Long
 */
public static Long getDateid(){
    String datePrefix = format(Calendar.getInstance().getTime(), "yyMMdd");
    return Long.parseLong(datePrefix + "00000000");
}
/**
 * Manual smoke test: prints the value of {@code getDateid()}.
 *
 * @param args unused
 * @throws ParseException declared for the date-conversion calls tried here during development
 */
public static void main(String[] args) throws ParseException
{
    //System.out.println(convertDate("2014年7月","yyyy年MM月","yyyyMM"));
    Long todayId = getDateid();
    System.out.println(todayId);
}
}
\ No newline at end of file
package com.zzsn.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Common file and stream helpers.
 * Author: 李东亮
 * Created: 2015-5-14 18:19:59
 * Company: 郑州数能软件科技有限公司
 * @version 1.0
 *
 */
public class FileUtil {
    private static final Logger Log = LoggerFactory.getLogger(FileUtil.class);
    static int NAME= 0 ;

    /**
     * Collects every match of a regular expression in a text.
     * Author: 李东亮
     * Created: 2015-5-14 17:26:25
     * @version 1.0
     * @param html text to search
     * @param reg  regular expression, e.g. {@code (<[^>]*>)}
     * @return all matches in order, each followed by a single space; empty when nothing matches
     */
    public static String reg(String html,String reg){
        Matcher m = Pattern.compile(reg).matcher(html);
        StringBuilder sb = new StringBuilder();
        while(m.find()){
            sb.append(m.group()).append(" ");
        }
        return sb.toString();
    }

    /**
     * Reads a stream to the end and decodes it with the platform default charset.
     * The stream is not closed.
     *
     * @param inputstream stream to read
     * @return the decoded content; on an I/O error, whatever was read before the error
     */
    public static String readContent(InputStream inputstream){
        // Bytes are accumulated and decoded once: decoding each 512-byte chunk
        // separately ignored the read count (appending stale buffer bytes) and
        // could split multi-byte characters at chunk boundaries.
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] bytes = new byte[512];
        try {
            int count;
            while((count = inputstream.read(bytes))!=-1){
                collected.write(bytes, 0, count);
            }
        } catch (IOException e) {
            Log.error("Failed to read stream content", e);
        }
        return collected.toString();
    }

    /**
     * Reads a stream to the end using the given charset and closes it.
     * Author: 李东亮
     * Created: 2015-5-25 13:42:14
     * @version 1.0
     * @param inputstream stream to read; closed before returning
     * @param chartset    name of the charset used to decode the stream
     * @return the decoded content
     * @throws Exception if the charset is unsupported or reading fails
     */
    public static String readHtml(InputStream inputstream,String chartset)throws Exception{
        StringBuilder sb = new StringBuilder();
        // try-with-resources: the old finally block called close() on a null
        // reader when the charset was unsupported, masking the real error.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(inputstream,chartset))) {
            char[] cbuf = new char[512];
            int count;
            while (( count = br.read(cbuf))!=-1){
                sb.append(cbuf,0,count);
            }
        }
        return sb.toString();
    }

    /**
     * Reads a whole file into a string.
     * Author: 李东亮
     * Created: 2015-5-14 17:22:38
     * @version 1.0
     * @param file     file to read
     * @param chartset name of the charset used to decode the file
     * @return the file content; on an error, whatever was read before it (possibly empty)
     */
    public static String readFile(File file,String chartset){
        StringBuilder sb = new StringBuilder();
        // try-with-resources: a missing file used to leave the reader null and
        // the finally block then threw NullPointerException.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),chartset))) {
            char[] buffer = new char[512];
            int index;
            while ((index = br.read(buffer))!=-1){
                sb.append(buffer,0,index);
            }
        } catch (FileNotFoundException e) {
            Log.error("File not found: " + file, e);
        } catch (IOException e) {
            Log.error("Failed to read file: " + file, e);
        }
        return sb.toString();
    }

    /**
     * Writes a string to a file, preceded by an HTML meta tag declaring the charset.
     * Author: 李东亮
     * Created: 2015-5-14 17:40:21
     * @version 1.0
     * @param file    target file; overwritten if it exists
     * @param str     content to write
     * @param charset name of the charset used to encode {@code str} and named in the meta tag
     */
    public static void writeUTF8File(File file,String str,String charset){
        // try-with-resources: a failed open used to leave the stream null and
        // the finally block then threw NullPointerException.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(("<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset="+charset+"\">").getBytes());
            fos.write(str.getBytes(charset));
        } catch (FileNotFoundException e) {
            Log.error("Cannot open file for writing: " + file, e);
        } catch (IOException e) {
            Log.error("Failed to write file: " + file, e);
        }
    }

    /**
     * Copies a stream (for example a response body) into a file.
     * The input stream is not closed.
     * Author: 杨海龙
     * Created: 2015-7-10 11:46:00
     * @version 1.0
     * @param file target file; overwritten if it exists
     * @param ips  stream to copy from
     */
    public static void writeFile(File file,InputStream ips){
        try (FileOutputStream fos = new FileOutputStream(file)) {
            byte[] b = new byte[1024];
            int i;
            while ((i = ips.read(b)) != -1) {
                fos.write(b,0,i);
            }
        } catch (FileNotFoundException e2) {
            Log.error("Cannot open file for writing: " + file, e2);
        } catch (IOException e) {
            Log.error("Failed to write file: " + file, e);
        }
    }

    /**
     * Deletes a file, or a directory together with everything below it.
     * Author: 杨海龙
     * Created: 2015-6-12 15:42:28
     * @version 1.0
     * @param file file or directory to delete
     */
    public static void deleteFile(File file){
        if (file.isFile()) {
            file.delete();
        } else if (file.isDirectory()) {
            // listFiles() returns null when the directory cannot be listed.
            File[] files = file.listFiles();
            if (files != null) {
                for (File child : files) {
                    deleteFile(child);
                }
            }
            file.delete();
        }
    }
}
package com.zzsn.util;
import com.zzsn.common.TextContent;
/**
 * Plain data holder for a text message.
 * NOTE(review): the field names presumably mirror the JSON keys of the
 * WeChat Work message-send payload - confirm against the API before renaming.
 */
public class TextMessage {
    /** Recipient user id(s). */
    private String touser;
    /** Recipient party id(s). */
    private String toparty;
    /** Recipient tag id(s). */
    private String totag;
    /** Message type. */
    private String msgtype;
    /** Id of the sending application. */
    private int agentid;
    /** Text body of the message. */
    private TextContent text;
    /** "Safe" flag of the message. */
    private int safe;

    /** @return the text body */
    public TextContent getText() {
        return this.text;
    }

    /** @param text the text body */
    public void setText(TextContent text) {
        this.text = text;
    }

    /** @return the recipient user id(s) */
    public String getTouser() {
        return this.touser;
    }

    /** @param touser the recipient user id(s) */
    public void setTouser(String touser) {
        this.touser = touser;
    }

    /** @return the recipient party id(s) */
    public String getToparty() {
        return this.toparty;
    }

    /** @param toparty the recipient party id(s) */
    public void setToparty(String toparty) {
        this.toparty = toparty;
    }

    /** @return the recipient tag id(s) */
    public String getTotag() {
        return this.totag;
    }

    /** @param totag the recipient tag id(s) */
    public void setTotag(String totag) {
        this.totag = totag;
    }

    /** @return the message type */
    public String getMsgtype() {
        return this.msgtype;
    }

    /** @param msgtype the message type */
    public void setMsgtype(String msgtype) {
        this.msgtype = msgtype;
    }

    /** @return the id of the sending application */
    public int getAgentid() {
        return this.agentid;
    }

    /** @param agentid the id of the sending application */
    public void setAgentid(int agentid) {
        this.agentid = agentid;
    }

    /** @return the "safe" flag */
    public int getSafe() {
        return this.safe;
    }

    /** @param safe the "safe" flag */
    public void setSafe(int safe) {
        this.safe = safe;
    }
}
package com.zzsn.util;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.common.AccessToken;
import com.zzsn.common.MyX509TrustManager;
import com.zzsn.common.TextContent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.ConnectException;
import java.net.URL;
/**
 * General-purpose helper for the WeChat Work (qyapi.weixin.qq.com) HTTP API.
 *
 * @author liuyq
 * @date 2013-08-09
 */
public class WeixinUtil {
    private static Logger log = LoggerFactory.getLogger(WeixinUtil.class);
    /** Tag-list endpoint; ACCESS_TOKEN is a placeholder to be replaced. */
    public static final String get_tags_url = "https://qyapi.weixin.qq.com/cgi-bin/tag/list?access_token=ACCESS_TOKEN";
    /** Message-send endpoint; ACCESS_TOKEN is a placeholder to be replaced. */
    public static String message_send_url = "https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=ACCESS_TOKEN";
    /** Token endpoint; APPID and APPSECRET are placeholders to be replaced. */
    public static final String ACCESS_TOKEN_URL="https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid=APPID&corpsecret=APPSECRET";

    /**
     * Sends an HTTPS request and parses the response body as JSON.
     *
     * @param requestUrl    request URL
     * @param requestMethod HTTP method (GET or POST)
     * @param outputStr     request body to send as UTF-8, or null for none
     * @return the parsed response, or null when the request or parsing fails
     */
    public static JSONObject httpRequest(String requestUrl, String requestMethod, String outputStr) {
        JSONObject jsonObject = null;
        HttpsURLConnection httpUrlConn = null;
        try {
            // NOTE(review): certificate checks are delegated to the project's
            // MyX509TrustManager; if it accepts every certificate this
            // connection is open to interception - confirm.
            TrustManager[] tm = { new MyX509TrustManager() };
            SSLContext sslContext = SSLContext.getInstance("SSL", "SunJSSE");
            sslContext.init(null, tm, new java.security.SecureRandom());
            SSLSocketFactory ssf = sslContext.getSocketFactory();
            URL url = new URL(requestUrl);
            httpUrlConn = (HttpsURLConnection) url.openConnection();
            httpUrlConn.setSSLSocketFactory(ssf);
            httpUrlConn.setDoOutput(true);
            httpUrlConn.setDoInput(true);
            httpUrlConn.setUseCaches(false);
            httpUrlConn.setRequestMethod(requestMethod);
            if ("GET".equalsIgnoreCase(requestMethod))
                httpUrlConn.connect();
            if (null != outputStr) {
                // Encode explicitly as UTF-8 so non-ASCII text is not garbled.
                try (OutputStream outputStream = httpUrlConn.getOutputStream()) {
                    outputStream.write(outputStr.getBytes("UTF-8"));
                }
            }
            StringBuilder buffer = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(httpUrlConn.getInputStream(), "utf-8"))) {
                String str;
                while ((str = bufferedReader.readLine()) != null) {
                    buffer.append(str);
                }
            }
            jsonObject = JSONObject.parseObject(buffer.toString());
        } catch (ConnectException ce) {
            log.error("Weixin server connection timed out.", ce);
        } catch (Exception e) {
            // The throwable goes last without a placeholder so SLF4J logs its
            // stack trace; the former "{}" was left unfilled.
            log.error("https request error", e);
        } finally {
            if (httpUrlConn != null) {
                httpUrlConn.disconnect();
            }
        }
        return jsonObject;
    }

    /**
     * Fetches an access token.
     *
     * @param appid     corp id (credential)
     * @param appsecret corp secret
     * @return the token, or null when the request fails or the response carries no token
     */
    public static AccessToken getAccessToken(String appid, String appsecret) {
        String requestUrl = ACCESS_TOKEN_URL.replace("APPID", appid).replace("APPSECRET", appsecret);
        JSONObject jsonObject = httpRequest(requestUrl, "GET", null);
        if (null == jsonObject) {
            return null;
        }
        try {
            String token = jsonObject.getString("access_token");
            Integer expiresIn = jsonObject.getInteger("expires_in");
            // An error response has no token fields; detect that explicitly
            // instead of relying on an exception from a null value.
            if (token == null || expiresIn == null) {
                log.error("获取token失败 errcode:{} errmsg:{}", jsonObject.getInteger("errcode"), jsonObject.getString("errmsg"));
                return null;
            }
            AccessToken accessToken = new AccessToken();
            accessToken.setToken(token);
            accessToken.setExpiresIn(expiresIn);
            return accessToken;
        } catch (Exception e) {
            log.error("获取token失败 errcode:{} errmsg:{}", jsonObject.getInteger("errcode"), jsonObject.getString("errmsg"));
            return null;
        }
    }

    /**
     * Sends a text message to a user.
     *
     * @param touser  recipient user id(s)
     * @param content message text
     * @param agentid id of the sending application
     * @param corpid  corp id used to obtain the access token
     * @param secret  corp secret used to obtain the access token
     * @return the parsed API response, or null when no access token could be
     *         obtained or the request fails
     */
    public static JSONObject sendWxMessage(String touser,String content,int agentid, String corpid, String secret){
        TextMessage text = new TextMessage();
        text.setAgentid(agentid);
        text.setMsgtype("text");
        text.setSafe(0);
        text.setToparty("");
        text.setTouser(touser);
        TextContent contentC = new TextContent();
        contentC.setContent(content);
        text.setText(contentC);
        // getAccessToken returns null on failure; dereferencing it used to
        // throw NullPointerException here.
        AccessToken accessToken = getAccessToken(corpid,secret);
        if (accessToken == null || accessToken.getToken() == null) {
            log.error("Cannot send message: no access token available");
            return null;
        }
        String url = message_send_url.replace("ACCESS_TOKEN", accessToken.getToken());
        String jsonData = JSONObject.toJSONString(text);
        log.info(jsonData);
        return WeixinUtil.httpRequest(url, "POST", jsonData);
    }
}
server.port=8079
#spring.jpa.database=MYSQL
#spring.datasource.driver-class-name=com.mysql.jdbc.Driver
#spring.datasource.url=jdbc:mysql://114.115.159.144:3306/meta_baidu?useUnicode=true&characterEncoding=utf-8&useSSL=false
#spring.datasource.username=root
#spring.datasource.password=zzsn9988
##spring.datasource.password=root
#spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
#spring.datasource.initialSize=1
#spring.datasource.minIdle=3
#spring.datasource.maxActive=20
#spring.datasource.maxWait=60000
#spring.datasource.timeBetweenEvictionRunsMillis=60000
#spring.datasource.minEvictableIdleTimeMillis=30000
#spring.datasource.validationQuery=select 'x'
#spring.datasource.testWhileIdle=true
#spring.datasource.testOnBorrow=false
#spring.datasource.testOnReturn=false
#spring.jpa.hibernate.ddl-auto=update
#mybatis-plus.mapper-locations*: classpath*:mapper/*Mapper.xml
#mybatis.mapper-locations=classpath*:/mapper/*Mapper.xml
logging.file.path=/log
logging.file.name=log/crawler.log
logging.root=info
# Kafka producer settings (the original comment was lost to an encoding error)
kafka.producer.servers=114.115.159.144:9092
kafka.producer.retries=0
kafka.producer.batch.size=4096
kafka.producer.linger=1
kafka.producer.buffer.memory=40960
DEV_MODEL=0
CONTENT_DIR=D\://toy/dest/content
IMG_DIR=D\://toy/dest/img
SAVE_LIMIT_SIZE=1
AUTO_KEYWORDS_SIZE=10
LUCENE_INDEX_DIR=D\://toy/index
AUTO_TYPE_WEIGHT=20
SUMMARY_MAX_LENGTH=350
SAVE_TIME_INTERVAL_MINUTE=1
CACHE_UPDATE_INTERVAL=3000
LUCENE_TASK_INDEX_DIR=D\://toy/index/task
HASH_MEMCACHE_SIZE=100000
HASH_SIMILARITY_RATE=0.88
FILTERTID=
SUBJECT_ANALYSIS_ORGID=2321
SUBJECT_COMMENT_URL=news.163.com;news.sina.com.cn;www.toutiao.com;stock.10jqka.com.cn;guba.eastmoney.com
RELEVANCE_XGBOOST_URL=http://192.168.1.224:8009/api2/
RELEVANCE_LOGISTIC_URL=http://192.168.1.224:8007/api2
SENTIMENT_ANALYSIS_URL=http://118.190.174.96:8008/api2/
CONTENT_EXCLE_URL=D\://111.xls
KEYWORDS_EXCLE_URL=D\://typeword.xls
RESULT_EXCLE_URL=D\://data//subject//\u56FD\u4F01\u516C\u76CA\u5206\u7C7B//
THREAD_NUM=10
SUBJECT_MEMCACHED_DAYS=0
JWYQJC_INFILE_URL=D\://data//jwyqyqjc//keywords.txt
JWYQJC_MEMCACHED_DAYS=10
TITLE_SIMILARITY_RATE=0.8
MODEL_SCORE_URL=http://114.115.215.250:8088/score/getScoreByTidAndTypeNamePost
CACHE_UPDATE=1
PROXY=0
PROXYID=1
#google驱动地址
CHROMEDRIVE=E:\\chrome\\chromedriver.exe
CHROMEBIN=C:\\Users\\WIN10\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
#mysql connection
mysql_url=jdbc:mysql://localhost:3306/clb_project?useUnicode=true&characterEncoding=utf8
mysql_username=root
mysql_password=root
#kafka服务地址
KAFKA_CONSUMER_SERVERS=114.115.159.144:9092
#微信公众号获取地址weChatInfo
KAFKA_CONSUMER_TOPIC = weChatInfo
#消费信息组
KAFKA_CONSUMER_GROUP_ID=wx-es-sync
KAFKA_CONSUMER_AUTO_OFFSET_RESET=earliest
#信息保存的topic
KAFKA_PRODUCT_TOPIC=wxcrawlerInfo1
#KAFKA_PRODUCT_TOPIC=crawlerInfo
#抓取资讯统计
KAFKA_COLLECT_TOPIC=collectionAndDispatcherInfo
#微信账号名称
WXSENDNAME= LiuWeiGang
#WXSENDNAME= lwg
path= E:\\ideaWorkerspace\\crawler_2022\\weixinCrawler\\src\\main\\resources\\static\\wechat-processor.templete
META_SEARCH_URL=https://www.google.com/search?hl=en&lr=lang_en&tbm=nws&sa=X&q=
#META_SEARCH_URL=https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=
# Redis settings
redis.host=127.0.0.1
redis.port=6379
redis.pass=xxxxxx
redis.timeout=10000
redis.maxIdle=300
redis.maxTotal=600
# milliseconds
redis.maxWaitMillis=1000
redis.testOnBorrow=false
#memcached.server=192.168.1.50:11211
#memcached.server=127.0.0.1:11211
memcached.server=114.116.122.247:11211
#memcached.server1.weight=1
memcached.connectionPoolSize=50
memcached.failureMode=true
memcached.connectTimeout=60000
memcached.opTimeout=5000
memcached.enableHealSession=true
memcached.statistics=true
memcached.binaryCommand=true
memcahced.sessionIdleTimeout=30
memcached.optimizeMergeBuffer=true
memcached.mergeFactor=50
\ No newline at end of file
# Redis settings
redis.host=127.0.0.1
redis.port=6379
redis.pass=xxxxxx
redis.timeout=10000
redis.maxIdle=300
redis.maxTotal=600
# milliseconds
redis.maxWaitMillis=1000
redis.testOnBorrow=false
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<!-- Extraction template: each <exp> holds the selector expression for one article field (content, title, author, publish_date). NOTE(review): the expression syntax is defined by the consuming parser; '|' presumably separates alternatives, to be confirmed. -->
<template><content><exp>*.div[id="js_content"]</exp></content><title><exp>*.h1[class="rich_media_title"]|h2[class="rich_media_title"]</exp></title><author><exp>*.a[id="js_name"]</exp></author><publish_date><exp>*.em[id="post-date"]</exp></publish_date></template>
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论