Skip to content

Commit

Permalink
Download posts from Pleonast
Browse files Browse the repository at this point in the history
  • Loading branch information
danwatt committed Feb 21, 2013
0 parents commit c5873fc
Show file tree
Hide file tree
Showing 14 changed files with 300 additions and 0 deletions.
36 changes: 36 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
  <!-- Main Java sources, compiled to target/classes. -->
  <classpathentry kind="src" output="target/classes" path="src/main/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Main resources folder; the "**" exclusion pattern excludes every file from this source entry. -->
  <classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Test Java sources, compiled to target/test-classes. -->
  <classpathentry kind="src" output="target/test-classes" path="src/test/java">
    <attributes>
      <attribute name="optional" value="true"/>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Test resources folder; same "**" exclusion as the main resources entry. -->
  <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- JRE container bound to the J2SE-1.5 execution environment. -->
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- m2e classpath container. -->
  <classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
    <attributes>
      <attribute name="maven.pomderived" value="true"/>
    </attributes>
  </classpathentry>
  <!-- Default output folder for entries that do not name their own. -->
  <classpathentry kind="output" path="target/classes"/>
</classpath>
23 changes: 23 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
  <name>pleonast.downloader</name>
  <comment/>
  <projects/>
  <!-- Builders listed in document order: the JDT Java builder, then the m2e Maven builder. -->
  <buildSpec>
    <buildCommand>
      <name>org.eclipse.jdt.core.javabuilder</name>
      <arguments/>
    </buildCommand>
    <buildCommand>
      <name>org.eclipse.m2e.core.maven2Builder</name>
      <arguments/>
    </buildCommand>
  </buildSpec>
  <!-- Project natures: Java (JDT) and Maven (m2e). -->
  <natures>
    <nature>org.eclipse.jdt.core.javanature</nature>
    <nature>org.eclipse.m2e.core.maven2Nature</nature>
  </natures>
</projectDescription>
5 changes: 5 additions & 0 deletions .settings/org.eclipse.jdt.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# JDT compiler preferences for this project.
# Source level, compliance level and generated class-file target are all set to 1.5.
# NOTE(review): Eclipse may rewrite this file and drop these comments - confirm before relying on them.
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
org.eclipse.jdt.core.compiler.compliance=1.5
# Forbidden references are reported at "warning" severity.
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.5
4 changes: 4 additions & 0 deletions .settings/org.eclipse.m2e.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# m2e (Maven Integration for Eclipse) preferences for this project.
# No active profiles are listed; workspace project resolution is switched on.
# NOTE(review): Eclipse may rewrite this file and drop these comments - confirm before relying on them.
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
21 changes: 21 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.online201</groupId>
  <artifactId>pleonast.downloader</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Pleonast Archiver</name>

  <properties>
    <!-- Pin the source encoding so the build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.11</version>
    </dependency>
    <dependency>
      <groupId>joda-time</groupId>
      <artifactId>joda-time</artifactId>
      <version>2.1</version>
    </dependency>
    <!--
      PleoDownloader imports org.apache.commons.lang3.StringUtils directly, so the
      dependency is declared here instead of being picked up only transitively.
      NOTE(review): 3.1 is believed to be the version htmlunit 2.11 already pulls in,
      which would keep the resolved version unchanged - confirm with
      "mvn dependency:tree".
    -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.1</version>
    </dependency>
  </dependencies>
</project>
34 changes: 34 additions & 0 deletions src/main/java/com/online201/pleonast/downloader/Comment.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package com.online201.pleonast.downloader;

import org.joda.time.DateTime;

/**
 * Mutable value holder for one comment: two free-form strings
 * ({@code text} and {@code comment}) and a timestamp.
 * All properties start out as {@code null} until a setter is called.
 */
public class Comment {

    private String comment;
    private DateTime date;
    private String text;

    /** @return the stored comment string, or {@code null} if none was set */
    public String getComment() {
        return this.comment;
    }

    /** @param comment the comment string to store */
    public void setComment(final String comment) {
        this.comment = comment;
    }

    /** @return the stored timestamp, or {@code null} if none was set */
    public DateTime getDate() {
        return this.date;
    }

    /** @param date the timestamp to store */
    public void setDate(final DateTime date) {
        this.date = date;
    }

    /** @return the stored text string, or {@code null} if none was set */
    public String getText() {
        return this.text;
    }

    /** @param text the text string to store */
    public void setText(final String text) {
        this.text = text;
    }

}
45 changes: 45 additions & 0 deletions src/main/java/com/online201/pleonast/downloader/Entry.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package com.online201.pleonast.downloader;

import java.util.ArrayList;
import java.util.List;

import org.joda.time.DateTime;

/**
 * Mutable value holder for one journal entry: title, timestamp, body and
 * the list of {@link Comment}s attached to it.
 */
public class Entry {

    /** Starts as an empty list; replaced wholesale by {@link #setComments(List)}. */
    private List<Comment> comments = new ArrayList<Comment>();
    private String body;
    private DateTime date;
    private String title;

    /** @return the stored title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(final String title) {
        this.title = title;
    }

    /** @return the stored timestamp, or {@code null} if none was set */
    public DateTime getDate() {
        return this.date;
    }

    /** @param date the timestamp to store */
    public void setDate(final DateTime date) {
        this.date = date;
    }

    /** @return the stored body, or {@code null} if none was set */
    public String getBody() {
        return this.body;
    }

    /** @param body the body to store */
    public void setBody(final String body) {
        this.body = body;
    }

    /** @return the list reference currently held (the same instance, not a copy) */
    public List<Comment> getComments() {
        return this.comments;
    }

    /** @param comments the list to hold from now on; stored as-is, without copying */
    public void setComments(final List<Comment> comments) {
        this.comments = comments;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package com.online201.pleonast.downloader;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomText;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;

/**
 * Command-line tool that logs in to pleonast.com, walks every page of one
 * user's journal and parses each entry together with its comments.
 *
 * <p>Usage: {@code PleoDownloader <username> <password>}. The same username is
 * used both for the login form and for the journal URL.</p>
 */
public class PleoDownloader {

    /**
     * Parses timestamps such as {@code 02/20/13 10:14PM}.
     * {@code hh} (clock hour 1-12) is required alongside the AM/PM marker; the
     * previous {@code HH} (hour of day 0-23) conflicted with the marker, so
     * afternoon times could not be parsed correctly.
     */
    private static final DateTimeFormatter dateParser = DateTimeFormat.forPattern("MM/dd/yy hh:mmaa");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the username, {@code args[1]} the password
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws FailingHttpStatusCodeException propagated from HtmlUnit page loads
     * @throws MalformedURLException propagated from HtmlUnit page loads
     * @throws IOException propagated from HtmlUnit page loads
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        if (args == null || args.length < 2) {
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            throw new IllegalArgumentException("Usage: PleoDownloader <username> <password>");
        }

        WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6);
        try {
            client.getOptions().setJavaScriptEnabled(false);
            client.getOptions().setCssEnabled(false);
            logIn(args, client);

            HtmlPage currentPage = client.getPage("http://pleonast.com/users/"+args[0]);

            int numPages = countPages(currentPage);
            System.out.println("There are " + numPages +" pages worth of entries to parse");

            List<Entry> entries = new ArrayList<Entry>();

            // Page 1 is already loaded; the remaining pages are fetched by number.
            entries.addAll(parseEntries(currentPage));
            for (int i = 2; i <= numPages; i++) {
                currentPage = client.getPage("http://pleonast.com/users/"+args[0]+"?page="+i);
                entries.addAll(parseEntries(currentPage));
            }
        } finally {
            // Release the windows and connections held by the client, even on failure.
            client.closeAllWindows();
        }
    }

    /**
     * Determines how many pages the journal has by taking the largest numeric
     * label among the direct children of the pagination block.
     *
     * <p>The earlier code read only the "current" span, which on the first page
     * is the current page number rather than the total, and it threw when no
     * pagination block existed. The "current" span is still one of the
     * children inspected here.</p>
     *
     * @param page the first page of the journal
     * @return the highest page number found, or 1 when there is none
     */
    private static int countPages(HtmlPage page) {
        int highest = 1;
        for (Object child : page.getByXPath("//div[@class='pagination']/*")) {
            if (!(child instanceof DomNode)) {
                continue;
            }
            String label = ((DomNode) child).getTextContent().trim();
            try {
                highest = Math.max(highest, Integer.parseInt(label));
            } catch (NumberFormatException e) {
                // Non-numeric labels (for example navigation links) are not page numbers.
            }
        }
        return highest;
    }

    /**
     * Converts every {@code div.entry} element on the page into an {@link Entry}
     * and prints a one-line summary for each.
     *
     * @param currentPage a loaded journal page
     * @return the entries found on that page, in document order
     */
    private static List<Entry> parseEntries(HtmlPage currentPage) {
        List<Entry> entries = new ArrayList<Entry>();
        // getByXPath is untyped; this expression selects div elements only.
        @SuppressWarnings("unchecked")
        List<HtmlElement> pageEntries = (List<HtmlElement>) currentPage.getByXPath("//div[@class='entry']");
        for (HtmlElement pageEntry : pageEntries) {
            Entry entry = new Entry();
            entry.setTitle(pageEntry.getElementsByTagName("h1").get(0).getTextContent().trim());

            // Keep the inner markup of the body div: strip its own opening and closing tags.
            String body = pageEntry.getElementsByAttribute("div", "class", "body").get(0).asXml();
            body = StringUtils.substringAfter(body, ">");
            body = StringUtils.substringBeforeLast(body, "<");
            body = StringUtils.replace(body, " <br/>\n", "\n");

            String date = pageEntry.getElementsByAttribute("div", "class", "byline").get(0).getElementsByTagName("span").get(0).getTextContent().trim();
            // NOTE(review): presumably the byline can contain a non-breaking space;
            // it is written as an explicit escape so the intent survives editors.
            entry.setDate(dateParser.parseDateTime(date.replace('\u00A0', ' ')));
            entry.setBody(body);
            entry.setComments(parseComments(pageEntry));
            System.out.println(entry.getDate().toString() +" : " + entry.getTitle()+", " + entry.getComments().size() +" comments");
            entries.add(entry);
        }
        return entries;
    }

    /**
     * Submits the login form using {@code args[0]} as username and
     * {@code args[1]} as password.
     *
     * @param args command-line arguments; the first two are read
     * @param client the client whose session should become authenticated
     * @throws IOException propagated from HtmlUnit
     * @throws MalformedURLException propagated from HtmlUnit
     */
    private static void logIn(String[] args, WebClient client)
            throws IOException, MalformedURLException {
        HtmlPage page = client.getPage("http://pleonast.com/login");
        HtmlInput username = page.getElementByName("user_session[username]");
        HtmlInput password = page.getElementByName("user_session[password]");

        username.setValueAttribute(args[0]);
        password.setValueAttribute(args[1]);
        HtmlSubmitInput loginButton = (HtmlSubmitInput) page.getElementByName("commit");
        loginButton.click();
    }

    /**
     * Builds a {@link Comment} for every {@code li.comment} element inside the
     * given entry element.
     *
     * <p>{@code text} receives the last path segment of the first link in the
     * meta block, {@code comment} the trimmed text of the body block (it was
     * previously looked up but never stored), and {@code date} the timestamp
     * found in the meta block's text nodes.</p>
     *
     * @param pageEntry the entry element to search
     * @return the comments found, in document order
     */
    private static List<Comment> parseComments(HtmlElement pageEntry) {
        List<Comment> comments = new ArrayList<Comment>();
        List<HtmlElement> pageComments = pageEntry.getElementsByAttribute("li", "class", "comment");
        for (HtmlElement pageComment : pageComments) {
            Comment c = new Comment();
            HtmlElement right = pageComment.getElementsByAttribute("div", "class", "right").get(0);
            HtmlElement body = right.getElementsByAttribute("div", "class", "body").get(0);
            HtmlElement meta = right.getElementsByAttribute("div", "class", "meta").get(0);
            String who = StringUtils.substringAfterLast(meta.getElementsByTagName("a").get(0).getAttribute("href"),"/");
            for (DomNode node : meta.getChildren()) {
                String text = node.getTextContent().trim();
                // The timestamp is a bare text node of the form "at <date> <time>AM|PM".
                if (node instanceof DomText && text.startsWith("at ") && text.endsWith("M")) {
                    c.setDate(dateParser.parseDateTime(StringUtils.substringAfter(text," ").replace('\u00A0', ' ')));
                }
            }
            c.setText(who);
            c.setComment(body.getTextContent().trim());
            comments.add(c);
        }
        return comments;
    }
}
5 changes: 5 additions & 0 deletions target/classes/META-INF/MANIFEST.MF
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Manifest-Version: 1.0
Built-By: danwatt
Build-Jdk: 1.6.0_37
Created-By: Maven Integration for Eclipse

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#Generated by Maven Integration for Eclipse
#Wed Feb 20 22:14:42 CST 2013
version=0.0.1-SNAPSHOT
groupId=com.online201
m2e.projectName=pleonast.downloader
m2e.projectLocation=/Users/danwatt/Documents/workspace/pleonast.downloader
artifactId=pleonast.downloader
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<!--
  NOTE(review): this content is identical to the project's pom.xml and is listed
  among the target/ build-output files of this commit. It is presumably a
  generated copy; confirm, and if so make changes in pom.xml rather than here.
-->
<modelVersion>4.0.0</modelVersion>
<groupId>com.online201</groupId>
<artifactId>pleonast.downloader</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>Pleonast Archiver</name>
<dependencies>
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.11</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.1</version>
</dependency>
</dependencies>
</project>
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit c5873fc

Please sign in to comment.