diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..9fc2de7
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,36 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/.project b/.project
new file mode 100644
index 0000000..f658221
--- /dev/null
+++ b/.project
@@ -0,0 +1,23 @@
+
+
+ pleonast.downloader
+
+
+
+
+
+ org.eclipse.jdt.core.javabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.jdt.core.javanature
+ org.eclipse.m2e.core.maven2Nature
+
+
diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs
new file mode 100644
index 0000000..abec6ca
--- /dev/null
+++ b/.settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
+org.eclipse.jdt.core.compiler.compliance=1.5
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.5
diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs
new file mode 100644
index 0000000..f897a7f
--- /dev/null
+++ b/.settings/org.eclipse.m2e.core.prefs
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..55840f6
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,21 @@
+
+ 4.0.0
+ com.online201
+ pleonast.downloader
+ 0.0.1-SNAPSHOT
+ Pleonast Archiver
+
+
+ net.sourceforge.htmlunit
+ htmlunit
+ 2.11
+
+
+ joda-time
+ joda-time
+ 2.1
+
+
+
\ No newline at end of file
diff --git a/src/main/java/com/online201/pleonast/downloader/Comment.java b/src/main/java/com/online201/pleonast/downloader/Comment.java
new file mode 100644
index 0000000..a1df218
--- /dev/null
+++ b/src/main/java/com/online201/pleonast/downloader/Comment.java
@@ -0,0 +1,46 @@
+package com.online201.pleonast.downloader;
+
+import org.joda.time.DateTime;
+
+/**
+ * Mutable bean describing one comment attached to an {@link Entry}.
+ */
+public class Comment {
+    /** Free-form text slot; PleoDownloader stores the last path segment of the commenter link here. */
+    private String text;
+    /** Timestamp of the comment; stays {@code null} until {@link #setDate(DateTime)} is called. */
+    private DateTime date;
+    /** Second free-form text slot, stored independently of {@link #text}. */
+    private String comment;
+
+    /** @return the value last passed to {@link #setComment(String)}, or {@code null} if never set */
+    public String getComment() {
+        return comment;
+    }
+
+    /** @return the comment timestamp, or {@code null} if never set */
+    public DateTime getDate() {
+        return date;
+    }
+
+    /** @return the value last passed to {@link #setText(String)}, or {@code null} if never set */
+    public String getText() {
+        return text;
+    }
+
+    /** @param comment new value for the {@code comment} slot */
+    public void setComment(String comment) {
+        this.comment = comment;
+    }
+
+    /** @param date new comment timestamp */
+    public void setDate(DateTime date) {
+        this.date = date;
+    }
+
+    /** @param text new value for the {@code text} slot */
+    public void setText(String text) {
+        this.text = text;
+    }
+
+}
diff --git a/src/main/java/com/online201/pleonast/downloader/Entry.java b/src/main/java/com/online201/pleonast/downloader/Entry.java
new file mode 100644
index 0000000..7802435
--- /dev/null
+++ b/src/main/java/com/online201/pleonast/downloader/Entry.java
@@ -0,0 +1,57 @@
+package com.online201.pleonast.downloader;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.joda.time.DateTime;
+
+/**
+ * Mutable bean describing one journal entry together with its comments.
+ */
+public class Entry {
+    private String title;
+    private DateTime date;
+    private String body;
+    /** Starts out as an empty list (never {@code null}) so callers can iterate right away. */
+    private List<Comment> comments = new ArrayList<Comment>();
+
+    /** @return the entry body, or {@code null} if never set */
+    public String getBody() {
+        return body;
+    }
+
+    /** @return the live (not copied) list of comments */
+    public List<Comment> getComments() {
+        return comments;
+    }
+
+    /** @return the entry timestamp, or {@code null} if never set */
+    public DateTime getDate() {
+        return date;
+    }
+
+    /** @return the entry title, or {@code null} if never set */
+    public String getTitle() {
+        return title;
+    }
+
+    /** @param body new entry body */
+    public void setBody(String body) {
+        this.body = body;
+    }
+
+    /** @param comments replacement comment list; stored as-is, without copying */
+    public void setComments(List<Comment> comments) {
+        this.comments = comments;
+    }
+
+    /** @param date new entry timestamp */
+    public void setDate(DateTime date) {
+        this.date = date;
+    }
+
+    /** @param title new entry title */
+    public void setTitle(String title) {
+        this.title = title;
+    }
+}
diff --git a/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java b/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java
new file mode 100644
index 0000000..a604783
--- /dev/null
+++ b/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java
@@ -0,0 +1,99 @@
+package com.online201.pleonast.downloader;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.StringUtils;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+
+import com.gargoylesoftware.htmlunit.BrowserVersion;
+import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gargoylesoftware.htmlunit.html.DomNode;
+import com.gargoylesoftware.htmlunit.html.DomText;
+import com.gargoylesoftware.htmlunit.html.HtmlElement;
+import com.gargoylesoftware.htmlunit.html.HtmlInput;
+import com.gargoylesoftware.htmlunit.html.HtmlPage;
+import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
+
+/**
+ * Command-line scraper that logs in to pleonast.com and walks a user's entry pages,
+ * turning each entry and its comments into {@link Entry} / {@link Comment} beans.
+ */
+public class PleoDownloader {
+
+    /**
+     * Parser for the site's timestamps. Uses {@code hh} (clock hour 1-12) because the text
+     * carries an AM/PM marker; the 24-hour field {@code HH} would override the marker and
+     * read every PM time as AM.
+     */
+    private static final DateTimeFormatter dateParser = DateTimeFormat.forPattern("MM/dd/yy hh:mmaa");
+
+    /**
+     * Logs in, reads the page count from the first listing page and parses every page.
+     * The collected entries are only echoed to stdout; nothing is written to disk.
+     *
+     * @param args {@code args[0]} is the user name, {@code args[1]} the password
+     * @throws FailingHttpStatusCodeException if the site answers with an error status
+     * @throws MalformedURLException if a page URL cannot be built
+     * @throws IOException on any other network failure
+     */
+    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
+        if (args.length < 2) {
+            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
+            System.err.println("Usage: PleoDownloader <username> <password>");
+            System.exit(1);
+        }
+        WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6);
+        try {
+            client.getOptions().setJavaScriptEnabled(false);
+            client.getOptions().setCssEnabled(false);
+            logIn(args, client);
+
+            String userUrl = "http://pleonast.com/users/" + args[0];
+            HtmlPage currentPage = client.getPage(userUrl);
+
+            @SuppressWarnings("unchecked") // the XPath selects elements only
+            List<HtmlElement> pagination = (List<HtmlElement>) currentPage.getByXPath("//div[@class='pagination']/span[@class='current']");
+            // NOTE(review): span.current presumably holds the page being shown, not the
+            // total page count -- confirm against the site's markup.
+            // No pagination block at all is treated as a single page.
+            int numPages = pagination.isEmpty() ? 1 : Integer.parseInt(pagination.get(0).getTextContent().trim());
+            System.out.println("There are " + numPages +" pages worth of entries to parse");
+
+            List<Entry> entries = new ArrayList<Entry>();
+            entries.addAll(parseEntries(currentPage));
+            for (int i = 2; i <= numPages; i++) {
+                currentPage = client.getPage(userUrl + "?page=" + i);
+                entries.addAll(parseEntries(currentPage));
+            }
+        } finally {
+            // Release the windows held by the simulated browser.
+            client.closeAllWindows();
+        }
+    }
+
+    /**
+     * Converts every {@code div.entry} on the page into an {@link Entry} and prints a
+     * one-line summary for each.
+     *
+     * @param currentPage listing page to scan
+     * @return the entries in document order; empty if the page has none
+     */
+    private static List<Entry> parseEntries(HtmlPage currentPage) {
+        List<Entry> entries = new ArrayList<Entry>();
+        @SuppressWarnings("unchecked") // the XPath selects elements only
+        List<HtmlElement> pageEntries = (List<HtmlElement>) currentPage.getByXPath("//div[@class='entry']");
+        for (HtmlElement pageEntry : pageEntries) {
+            Entry entry = new Entry();
+            entry.setTitle(pageEntry.getElementsByTagName("h1").get(0).getTextContent().trim());
+
+            // Serialise the body div, then cut off its own opening and closing tag.
+            String body = pageEntry.getElementsByAttribute("div", "class", "body").get(0).asXml();
+            body = StringUtils.substringAfter(body, ">");
+            body = StringUtils.substringBeforeLast(body, "<");
+            // NOTE(review): this literal was mangled in the dump; "<br/>\n" is the
+            // presumed original -- confirm against the committed file.
+            body = StringUtils.replace(body, "<br/>\n", "\n");
+
+            String date = pageEntry.getElementsByAttribute("div", "class", "byline").get(0).getElementsByTagName("span").get(0).getTextContent().trim();
+            entry.setDate(dateParser.parseDateTime(normalizeDate(date)));
+            entry.setBody(body);
+            entry.setComments(parseComments(pageEntry));
+            System.out.println(entry.getDate().toString() +" : " + entry.getTitle()+", " + entry.getComments().size() +" comments");
+            entries.add(entry);
+        }
+        return entries;
+    }
+
+    /**
+     * Submits the login form with the credentials from the command line. The page
+     * returned by the click is discarded, so a failed login is not detected here.
+     *
+     * @param args {@code args[0]} is the user name, {@code args[1]} the password
+     * @param client browser session that should end up authenticated
+     * @throws IOException if the login page cannot be loaded or submitted
+     * @throws MalformedURLException if the login URL is rejected
+     */
+    private static void logIn(String[] args, WebClient client)
+            throws IOException, MalformedURLException {
+        HtmlPage page = client.getPage("http://pleonast.com/login");
+        HtmlInput username = page.getElementByName("user_session[username]");
+        HtmlInput password = page.getElementByName("user_session[password]");
+
+        username.setValueAttribute(args[0]);
+        password.setValueAttribute(args[1]);
+        HtmlSubmitInput loginButton = (HtmlSubmitInput) page.getElementByName("commit");
+        loginButton.click();
+    }
+
+    /**
+     * Converts every {@code li.comment} below the entry element into a {@link Comment}.
+     *
+     * @param pageEntry the {@code div.entry} element being parsed
+     * @return the comments in document order; empty if the entry has none
+     */
+    private static List<Comment> parseComments(HtmlElement pageEntry) {
+        List<Comment> comments = new ArrayList<Comment>();
+        List<HtmlElement> pageComments = pageEntry.getElementsByAttribute("li", "class", "comment");
+        for (HtmlElement pageComment : pageComments) {
+            Comment c = new Comment();
+            HtmlElement right = pageComment.getElementsByAttribute("div", "class", "right").get(0);
+            HtmlElement body = right.getElementsByAttribute("div", "class", "body").get(0);
+            HtmlElement meta = right.getElementsByAttribute("div", "class", "meta").get(0);
+            // The commenter is identified by the last path segment of the first link.
+            String who = StringUtils.substringAfterLast(meta.getElementsByTagName("a").get(0).getAttribute("href"),"/");
+            for (DomNode node : meta.getChildren()) {
+                String text = node.getTextContent().trim();
+                // The timestamp is a bare text node shaped like "at <date> <time>AM|PM".
+                if (node instanceof DomText && text.startsWith("at ") && text.endsWith("M")) {
+                    c.setDate(dateParser.parseDateTime(normalizeDate(StringUtils.substringAfter(text, " "))));
+                }
+            }
+            c.setText(who);
+            // Keep the comment text as well; it used to be fetched and then dropped.
+            // NOTE(review): the text/comment slot assignment is a guess -- confirm intent.
+            c.setComment(body.getTextContent().trim());
+            comments.add(c);
+        }
+        return comments;
+    }
+
+    /**
+     * Prepares a scraped timestamp for {@link #dateParser}: non-breaking spaces become
+     * plain spaces, runs of spaces collapse to one, and surrounding whitespace is removed.
+     */
+    private static String normalizeDate(String raw) {
+        return raw.replace('\u00a0', ' ').replaceAll(" +", " ").trim();
+    }
+}
diff --git a/target/classes/META-INF/MANIFEST.MF
b/target/classes/META-INF/MANIFEST.MF new file mode 100644 index 0000000..d9ca927 --- /dev/null +++ b/target/classes/META-INF/MANIFEST.MF @@ -0,0 +1,5 @@ +Manifest-Version: 1.0 +Built-By: danwatt +Build-Jdk: 1.6.0_37 +Created-By: Maven Integration for Eclipse + diff --git a/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties new file mode 100644 index 0000000..3868981 --- /dev/null +++ b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties @@ -0,0 +1,7 @@ +#Generated by Maven Integration for Eclipse +#Wed Feb 20 22:14:42 CST 2013 +version=0.0.1-SNAPSHOT +groupId=com.online201 +m2e.projectName=pleonast.downloader +m2e.projectLocation=/Users/danwatt/Documents/workspace/pleonast.downloader +artifactId=pleonast.downloader diff --git a/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml new file mode 100644 index 0000000..55840f6 --- /dev/null +++ b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml @@ -0,0 +1,21 @@ + + 4.0.0 + com.online201 + pleonast.downloader + 0.0.1-SNAPSHOT + Pleonast Archiver + + + net.sourceforge.htmlunit + htmlunit + 2.11 + + + joda-time + joda-time + 2.1 + + + \ No newline at end of file diff --git a/target/classes/com/online201/pleonast/downloader/Comment.class b/target/classes/com/online201/pleonast/downloader/Comment.class new file mode 100644 index 0000000..e67b474 Binary files /dev/null and b/target/classes/com/online201/pleonast/downloader/Comment.class differ diff --git a/target/classes/com/online201/pleonast/downloader/Entry.class b/target/classes/com/online201/pleonast/downloader/Entry.class new file mode 100644 index 0000000..06d9821 Binary files /dev/null and b/target/classes/com/online201/pleonast/downloader/Entry.class differ diff --git 
a/target/classes/com/online201/pleonast/downloader/PleoDownloader.class b/target/classes/com/online201/pleonast/downloader/PleoDownloader.class new file mode 100644 index 0000000..9bb9e24 Binary files /dev/null and b/target/classes/com/online201/pleonast/downloader/PleoDownloader.class differ