Deadlink #373

Open · wants to merge 8 commits into base: master
45 changes: 45 additions & 0 deletions crawler4j-examples/deadlinksniffer/README.adoc
@@ -0,0 +1,45 @@
= Crawler4j Dead Link Sniffer

This application scans a web page for dead links.

== Compiling

The whole application can be built using Maven:

----
$> mvn clean install
----

This will also bundle an executable application in `crawler4j-examples/deadlinksniffer/target/appassembler`.

== Usage

The commands below assume you are in the generated application directory (`crawler4j-examples/deadlinksniffer/target/appassembler`).

==== Getting more help
To print a description of the available command line parameters:
----
$> ./bin/DeadLinkSniffer -?
----

==== Scanning a web page for dead links
Example of scanning a sample page for dead links.
This will scan all sub-pages reachable from the given seed (`-s`) pages:
----
$> ./bin/DeadLinkSniffer -s=http://mypage.org
----

You can also restrict which URLs are crawled by passing regular expressions with the `-u` parameter.
To define multiple rules, simply add multiple `-u` parameters.

----
$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https://.*mypage.org.*"
----

==== Output

By default the output files are in `./crawl`.
The output directory can be specified with the `-o` parameter.
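
For example (assuming `-o` takes its value in the same `option=value` form as the other parameters shown above):

----
$> ./bin/DeadLinkSniffer -s=http://mypage.org -o=./deadlink-report
----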

The output directory contains a file `brokenPages.csv` listing all broken links that were found.
The first column is the HTTP status, e.g. 404 for 'not found'.
The second column is the URL of the missing resource.
The third column is the HTML page on which the dead link was found.
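
A hypothetical example row (the comma delimiter and the URLs are purely illustrative, not taken from this change):

----
404,http://mypage.org/images/missing.png,http://mypage.org/index.html
----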
61 changes: 61 additions & 0 deletions crawler4j-examples/deadlinksniffer/pom.xml
@@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <artifactId>crawler4j-parent</artifactId>
        <groupId>edu.uci.ics</groupId>
        <version>4.5.0-SNAPSHOT</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
    <artifactId>crawler4j-deadlinksniffer</artifactId>

    <description>find dead links on a web page</description>
    <url>https://github.com/yasserg/crawler4j</url>

    <dependencies>
        <dependency>
            <groupId>edu.uci.ics</groupId>
            <artifactId>crawler4j</artifactId>
            <version>${project.version}</version>
        </dependency>

        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!--
                    Application can be bundled via
                    $> mvn package appassembler:assemble
                -->
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>appassembler-maven-plugin</artifactId>
                <version>2.0.0</version>
                <configuration>
                    <programs>
                        <program>
                            <mainClass>edu.uci.ics.crawler4j.deadlinksniffer.DeadLinkCrawlController</mainClass>
                            <id>DeadLinkSniffer</id>
                        </program>
                    </programs>
                    <platforms>all</platforms>
                </configuration>
                <executions>
                    <execution>
                        <id>bundle</id>
                        <phase>package</phase>
                        <goals><goal>assemble</goal></goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
@@ -0,0 +1,69 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.uci.ics.crawler4j.deadlinksniffer;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

/**
 * @author <a href="mailto:[email protected]">Mark Struberg</a>
 */
public class DeadLinkCrawlConfig extends CrawlConfig {
    private List<Pattern> urlPatterns = new ArrayList<>();
    private volatile DeadLinkCrawlerStore crawlerStore;
    private List<Pattern> excludePatterns = new ArrayList<>();

    public List<Pattern> getUrlPatterns() {
        return urlPatterns;
    }

    public List<Pattern> getExcludePatterns() {
        return excludePatterns;
    }

    /**
     * Add a regular expression for URLs which should be followed
     * by the crawler.
     */
    public void addUrlPattern(String urlPattern) {
        this.urlPatterns.add(Pattern.compile(urlPattern));
    }

    /**
     * Add a regular expression for URLs which should be excluded from scanning.
     * This is effectively a stop criterion and gets evaluated
     * after all the patterns added via {@link #addUrlPattern(String)}.
     */
    public void addExcludePattern(String excludePattern) {
        this.excludePatterns.add(Pattern.compile(excludePattern));
    }

    public DeadLinkCrawlerStore getCrawlerStore() {
        // Lazy initialization via double-checked locking;
        // correctness relies on the crawlerStore field being volatile.
        if (crawlerStore == null) {
            synchronized (this) {
                if (crawlerStore == null) {
                    crawlerStore = new DeadLinkCrawlerStore(this);
                }
            }
        }

        return crawlerStore;
    }
}
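
Below is a minimal usage sketch (not part of this PR): the class name `DeadLinkConfigExample` and the pattern values are made up for illustration, and `setCrawlStorageFolder()` is inherited from crawler4j's `CrawlConfig`.

----
package edu.uci.ics.crawler4j.deadlinksniffer;

/**
 * Illustrative example only: configures a DeadLinkCrawlConfig roughly the
 * way the -s/-u command line options described in the README presumably do.
 */
public class DeadLinkConfigExample {
    public static void main(String[] args) {
        DeadLinkCrawlConfig config = new DeadLinkCrawlConfig();

        // Inherited from crawler4j's CrawlConfig: where crawl data gets stored.
        config.setCrawlStorageFolder("./crawl");

        // Follow only URLs on mypage.org (placeholder pattern) ...
        config.addUrlPattern("https?://.*mypage\\.org.*");

        // ... but never descend into the /archive/ area.
        config.addExcludePattern("https?://.*mypage\\.org/archive/.*");
    }
}
----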