Deadlink #373

Open · wants to merge 8 commits into base: master
45 changes: 45 additions & 0 deletions crawler4j-examples/deadlinksniffer/README.adoc
@@ -0,0 +1,45 @@
= Crawler4j Dead Link Sniffer

This application scans a web page for dead links.

== Compiling

The whole application can be built using Maven:

----
$> mvn clean install
----

This will also bundle an executable application in `crawler4j-examples/deadlinksniffer/target/appassembler`.

== Usage

The commands below assume you are in the generated application directory (`crawler4j-examples/deadlinksniffer/target/appassembler`).

==== Getting more help
To print a description of the available command line parameters:
----
$> ./bin/DeadLinkSniffer -?
----

==== Scanning a web page for dead links
Example of scanning a sample page for dead links.
This will scan all sub-pages reachable from the given seed (`-s`) pages:
----
$> ./bin/DeadLinkSniffer -s=http://mypage.org
----

You can also restrict which URLs are crawled by passing regular expressions with the `-u` parameter.
To define multiple rules, simply add multiple `-u` parameters.

----
$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https://.*mypage.org.*"
----

==== Output

By default the output files are in `./crawl`.
The output directory can be specified with the `-o` parameter.
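
For example (assuming `-o` takes its value in the same `option=value` form as the other parameters shown above):

----
$> ./bin/DeadLinkSniffer -s=http://mypage.org -o=./deadlink-report
----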

The output directory contains a file `brokenPages.csv` listing all broken links that were found.
The first column is the HTTP status, e.g. 404 for 'not found'.
The second column is the URL of the missing resource.
The third column is the HTML page on which the dead link was found.
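
A hypothetical example row (the comma delimiter and the URLs are purely illustrative, not taken from this change):

----
404,http://mypage.org/images/missing.png,http://mypage.org/index.html
----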
61 changes: 61 additions & 0 deletions crawler4j-examples/deadlinksniffer/pom.xml
@@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <artifactId>crawler4j-parent</artifactId>
        <groupId>edu.uci.ics</groupId>
        <version>4.5.0-SNAPSHOT</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
    <artifactId>crawler4j-deadlinksniffer</artifactId>

    <description>find dead links on a web page</description>
    <url>https://github.com/yasserg/crawler4j</url>

    <dependencies>
        <dependency>
            <groupId>edu.uci.ics</groupId>
            <artifactId>crawler4j</artifactId>
            <version>${project.version}</version>
        </dependency>

        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!--
                    Application can be bundled via
                    $> mvn package appassembler:assemble
                -->
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>appassembler-maven-plugin</artifactId>
                <version>2.0.0</version>
                <configuration>
                    <programs>
                        <program>
                            <mainClass>edu.uci.ics.crawler4j.deadlinksniffer.DeadLinkCrawlController</mainClass>
                            <id>DeadLinkSniffer</id>
                        </program>
                    </programs>
                    <platforms>all</platforms>
                </configuration>
                <executions>
                    <execution>
                        <id>bundle</id>
                        <phase>package</phase>
                        <goals><goal>assemble</goal></goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
@@ -0,0 +1,69 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.uci.ics.crawler4j.deadlinksniffer;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

/**
 * @author <a href="mailto:[email protected]">Mark Struberg</a>
 */
public class DeadLinkCrawlConfig extends CrawlConfig {
    private List<Pattern> urlPatterns = new ArrayList<>();
    private volatile DeadLinkCrawlerStore crawlerStore;
    private List<Pattern> excludePatterns = new ArrayList<>();

    public List<Pattern> getUrlPatterns() {
        return urlPatterns;
    }

    public List<Pattern> getExcludePatterns() {
        return excludePatterns;
    }

    /**
     * Add a regular expression for URLs which should be followed
     * by the crawler.
     */
    public void addUrlPattern(String urlPattern) {
        this.urlPatterns.add(Pattern.compile(urlPattern));
    }

    /**
     * Add a regular expression for URLs which should be excluded from scanning.
     * This is effectively a stop criterion and gets evaluated
     * after all the patterns added via {@link #addUrlPattern(String)}.
     */
    public void addExcludePattern(String excludePattern) {
        this.excludePatterns.add(Pattern.compile(excludePattern));
    }

    public DeadLinkCrawlerStore getCrawlerStore() {
        // Lazy initialization via double-checked locking;
        // correctness relies on the crawlerStore field being volatile.
        if (crawlerStore == null) {
            synchronized (this) {
                if (crawlerStore == null) {
                    crawlerStore = new DeadLinkCrawlerStore(this);
                }
            }
        }

        return crawlerStore;
    }
}
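
Below is a minimal usage sketch (not part of this PR): the class name `DeadLinkConfigExample` and the pattern values are made up for illustration, and `setCrawlStorageFolder()` is inherited from crawler4j's `CrawlConfig`.

----
package edu.uci.ics.crawler4j.deadlinksniffer;

/**
 * Illustrative example only: configures a DeadLinkCrawlConfig roughly the
 * way the -s/-u command line options described in the README presumably do.
 */
public class DeadLinkConfigExample {
    public static void main(String[] args) {
        DeadLinkCrawlConfig config = new DeadLinkCrawlConfig();

        // Inherited from crawler4j's CrawlConfig: where crawl data gets stored.
        config.setCrawlStorageFolder("./crawl");

        // Follow only URLs on mypage.org (placeholder pattern) ...
        config.addUrlPattern("https?://.*mypage\\.org.*");

        // ... but never descend into the /archive/ area.
        config.addExcludePattern("https?://.*mypage\\.org/archive/.*");
    }
}
----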