From c5873fcdc696539ced79234bb07f612446de5a0c Mon Sep 17 00:00:00 2001 From: danwatt Date: Wed, 20 Feb 2013 22:15:14 -0600 Subject: [PATCH] Download posts from Pleonast --- .classpath | 36 +++++++ .project | 23 ++++ .settings/org.eclipse.jdt.core.prefs | 5 + .settings/org.eclipse.m2e.core.prefs | 4 + pom.xml | 21 ++++ .../pleonast/downloader/Comment.java | 34 ++++++ .../online201/pleonast/downloader/Entry.java | 45 ++++++++ .../pleonast/downloader/PleoDownloader.java | 99 ++++++++++++++++++ target/classes/META-INF/MANIFEST.MF | 5 + .../pleonast.downloader/pom.properties | 7 ++ .../com.online201/pleonast.downloader/pom.xml | 21 ++++ .../pleonast/downloader/Comment.class | Bin 0 -> 1023 bytes .../online201/pleonast/downloader/Entry.class | Bin 0 -> 1592 bytes .../pleonast/downloader/PleoDownloader.class | Bin 0 -> 7674 bytes 14 files changed, 300 insertions(+) create mode 100644 .classpath create mode 100644 .project create mode 100644 .settings/org.eclipse.jdt.core.prefs create mode 100644 .settings/org.eclipse.m2e.core.prefs create mode 100644 pom.xml create mode 100644 src/main/java/com/online201/pleonast/downloader/Comment.java create mode 100644 src/main/java/com/online201/pleonast/downloader/Entry.java create mode 100644 src/main/java/com/online201/pleonast/downloader/PleoDownloader.java create mode 100644 target/classes/META-INF/MANIFEST.MF create mode 100644 target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties create mode 100644 target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml create mode 100644 target/classes/com/online201/pleonast/downloader/Comment.class create mode 100644 target/classes/com/online201/pleonast/downloader/Entry.class create mode 100644 target/classes/com/online201/pleonast/downloader/PleoDownloader.class diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..9fc2de7 --- /dev/null +++ b/.classpath @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.project b/.project new file mode 100644 index 0000000..f658221 --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + pleonast.downloader + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/.settings/org.eclipse.jdt.core.prefs b/.settings/org.eclipse.jdt.core.prefs new file mode 100644 index 0000000..abec6ca --- /dev/null +++ b/.settings/org.eclipse.jdt.core.prefs @@ -0,0 +1,5 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5 +org.eclipse.jdt.core.compiler.compliance=1.5 +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.5 diff --git a/.settings/org.eclipse.m2e.core.prefs b/.settings/org.eclipse.m2e.core.prefs new file mode 100644 index 0000000..f897a7f --- /dev/null +++ b/.settings/org.eclipse.m2e.core.prefs @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..55840f6 --- /dev/null +++ b/pom.xml @@ -0,0 +1,21 @@ + + 4.0.0 + com.online201 + pleonast.downloader + 0.0.1-SNAPSHOT + Pleonast Archiver + + + net.sourceforge.htmlunit + htmlunit + 2.11 + + + joda-time + joda-time + 2.1 + + + \ No newline at end of file diff --git a/src/main/java/com/online201/pleonast/downloader/Comment.java b/src/main/java/com/online201/pleonast/downloader/Comment.java new file mode 100644 index 0000000..a1df218 --- /dev/null +++ b/src/main/java/com/online201/pleonast/downloader/Comment.java @@ -0,0 +1,34 @@ +package com.online201.pleonast.downloader; + +import org.joda.time.DateTime; + +public class Comment { + private String text; + private DateTime date; + private String comment; + + public String getComment() { + return comment; + } + + public DateTime getDate() { + return date; + } + + public String getText() { + return text; + } + + public void setComment(String comment) { + this.comment = comment; + } + + public void setDate(DateTime date) { + this.date = date; + } + + public void setText(String text) { + this.text = text; + } + +} diff --git a/src/main/java/com/online201/pleonast/downloader/Entry.java b/src/main/java/com/online201/pleonast/downloader/Entry.java new file mode 100644 index 0000000..7802435 --- /dev/null +++ b/src/main/java/com/online201/pleonast/downloader/Entry.java @@ -0,0 +1,45 @@ +package com.online201.pleonast.downloader; + +import java.util.ArrayList; +import java.util.List; + +import org.joda.time.DateTime; + +public class Entry { + private String title; + private DateTime date; + private String body; + private List comments = new ArrayList(); + + public String getBody() { + return body; + } + + public List getComments() { + return comments; + } + + public DateTime getDate() { + return date; + } + + public String getTitle() { + return title; + } + + public void setBody(String body) { + this.body = body; + } + + public void setComments(List comments) { + this.comments = comments; + } + + public void setDate(DateTime date) { + this.date = date; + } + + public void setTitle(String title) { + this.title = title; + } +} diff --git a/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java b/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java new file mode 100644 index 0000000..a604783 --- /dev/null +++ b/src/main/java/com/online201/pleonast/downloader/PleoDownloader.java @@ -0,0 +1,99 @@ +package com.online201.pleonast.downloader; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +import com.gargoylesoftware.htmlunit.BrowserVersion; +import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; +import com.gargoylesoftware.htmlunit.WebClient; +import com.gargoylesoftware.htmlunit.html.DomNode; +import com.gargoylesoftware.htmlunit.html.DomText; +import com.gargoylesoftware.htmlunit.html.HtmlElement; +import com.gargoylesoftware.htmlunit.html.HtmlInput; +import com.gargoylesoftware.htmlunit.html.HtmlPage; +import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput; + +public class PleoDownloader { + + private static final DateTimeFormatter dateParser = DateTimeFormat.forPattern("MM/dd/yy HH:mmaa"); + public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException { + WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6); + client.getOptions().setJavaScriptEnabled(false); + client.getOptions().setCssEnabled(false); + logIn(args, client); + + HtmlPage currentPage = client.getPage("http://pleonast.com/users/"+args[0]); + + List pagination = (List) currentPage.getByXPath("//div[@class='pagination']/span[@class='current']"); + int numPages = Integer.parseInt(pagination.get(0).getTextContent().trim()); + System.out.println("There are " + numPages +" pages worth of entries to parse"); + + List entries = new ArrayList(); + + + entries.addAll(parseEntries(currentPage)); + for (int i = 2; i <= numPages; i++) { + currentPage = client.getPage("http://pleonast.com/users/"+args[0]+"?page="+i); + entries.addAll(parseEntries(currentPage)); + } + } + + private static List parseEntries(HtmlPage currentPage) { + List entries = new ArrayList(); + List pageEntries = (List) currentPage.getByXPath("//div[@class='entry']"); + for (HtmlElement pageEntry : pageEntries) { + Entry entry = new Entry(); + entry.setTitle(pageEntry.getElementsByTagName("h1").get(0).getTextContent().trim()); + String body = pageEntry.getElementsByAttribute("div", "class", "body").get(0).asXml(); + body = StringUtils.substringAfter(body, ">"); + body = StringUtils.substringBeforeLast(body, "<"); + body = StringUtils.replace(body, "
\n", "\n"); + String date = pageEntry.getElementsByAttribute("div", "class", "byline").get(0).getElementsByTagName("span").get(0).getTextContent().trim(); + entry.setDate(dateParser.parseDateTime(date.replace(" ", " "))); + entry.setBody(body); + entry.setComments(parseComments(pageEntry)); + System.out.println(entry.getDate().toString() +" : " + entry.getTitle()+", " + entry.getComments().size() +" comments"); + entries.add(entry); + } + return entries; + } + + private static void logIn(String[] args, WebClient client) + throws IOException, MalformedURLException { + HtmlPage page = client.getPage("http://pleonast.com/login"); + HtmlInput username = page.getElementByName("user_session[username]"); + HtmlInput password = page.getElementByName("user_session[password]"); + + username.setValueAttribute(args[0]); + password.setValueAttribute(args[1]); + HtmlSubmitInput loginButton = (HtmlSubmitInput) page.getElementByName("commit"); + loginButton.click(); + } + + private static List parseComments(HtmlElement pageEntry) { + List comments = new ArrayList(); + List pageComments = pageEntry.getElementsByAttribute("li", "class", "comment"); + for (HtmlElement pageComment : pageComments) { + Comment c = new Comment(); + HtmlElement right = pageComment.getElementsByAttribute("div", "class", "right").get(0); + HtmlElement body = right.getElementsByAttribute("div", "class", "body").get(0); + HtmlElement meta = right.getElementsByAttribute("div", "class", "meta").get(0); + String who = StringUtils.substringAfterLast(meta.getElementsByTagName("a").get(0).getAttribute("href"),"/"); + for (DomNode node : meta.getChildren()) { + String text = node.getTextContent().trim(); + if (node instanceof DomText && text.startsWith("at ") && text.endsWith("M")) { + c.setDate(dateParser.parseDateTime(StringUtils.substringAfter(text," ").replace(" ", " "))); + } + } + c.setText(who); + comments.add(c); + } + return comments; + } +} diff --git a/target/classes/META-INF/MANIFEST.MF b/target/classes/META-INF/MANIFEST.MF new file mode 100644 index 0000000..d9ca927 --- /dev/null +++ b/target/classes/META-INF/MANIFEST.MF @@ -0,0 +1,5 @@ +Manifest-Version: 1.0 +Built-By: danwatt +Build-Jdk: 1.6.0_37 +Created-By: Maven Integration for Eclipse + diff --git a/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties new file mode 100644 index 0000000..3868981 --- /dev/null +++ b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.properties @@ -0,0 +1,7 @@ +#Generated by Maven Integration for Eclipse +#Wed Feb 20 22:14:42 CST 2013 +version=0.0.1-SNAPSHOT +groupId=com.online201 +m2e.projectName=pleonast.downloader +m2e.projectLocation=/Users/danwatt/Documents/workspace/pleonast.downloader +artifactId=pleonast.downloader diff --git a/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml new file mode 100644 index 0000000..55840f6 --- /dev/null +++ b/target/classes/META-INF/maven/com.online201/pleonast.downloader/pom.xml @@ -0,0 +1,21 @@ + + 4.0.0 + com.online201 + pleonast.downloader + 0.0.1-SNAPSHOT + Pleonast Archiver + + + net.sourceforge.htmlunit + htmlunit + 2.11 + + + joda-time + joda-time + 2.1 + + + \ No newline at end of file diff --git a/target/classes/com/online201/pleonast/downloader/Comment.class b/target/classes/com/online201/pleonast/downloader/Comment.class new file mode 100644 index 0000000000000000000000000000000000000000..e67b474320fef4d6a82665cacfa84ca14be30388 GIT binary patch literal 1023 zcma)3O;6iE5Ph437>wJ{Kp>E`&@U+gsav?V;zHGv)gFkrZ{k(z$o5vw^6|4;2~??v z{(%0dsx!6`#W+$gJG-;<-pqUR`_Io`01olmK}}#dF|jmB6ejxU?Vi>4_C+SbasI?caT!+B8h2@8~a3Rtt1}lTPK# z1WMX4*75^oM|7bIhx#+SK=XZ=g!WjVJ{X=6^uz?(MIDblxL6fv`84CF^LVV&ks3!t z+xE>wMQ17v`CTL%_DeVyIPm{l7){|$wJl@{Yz&4EwX8s3)58jyG&5DR1c4nYT_BAC zJ02V?=YSEp#JQ&)mYBqy|I@NFc-WCmaHuHn)Eg{nbX&&_i1yT+rxX1#WP`jy(;Hq5 zo>TWat-g*Lk3hE?kC#Q#MbV)O@|sSMUXr4qxU~NRkG^MREjpWd5^Hq#iu)6^=_Rm^ z4La|GuhIEd0{Nh3OmSt?PWf*1^2kN%UnI;>M!8ptuWV%>bd%5u3h5VTfoJeX0Dgn4qp8Q D3-YAv literal 0 HcmV?d00001 diff --git a/target/classes/com/online201/pleonast/downloader/Entry.class b/target/classes/com/online201/pleonast/downloader/Entry.class new file mode 100644 index 0000000000000000000000000000000000000000..06d9821fcb25a64860b05abdb37e66e6dadce9b2 GIT binary patch literal 1592 zcmb7E?@!ZE6g_Xh8HM2n1DOH}NVg#c)K4HPsGk-!fspvI+nQO@wWNE~?7z}PLZXR( zfPa+n-q((evPJpQ_j=#G=bdxUZGZpy@e9BXo~02J*l2id!*d?) zg_`HLj5Duk8q#iC#%qe6(1tVWUb82VAr@`Rl^wbx0P0HHF=}>4Qqsh+-EvLY^$F{F zu^q>q!y02#_t@&vNS zUe|9}Z*BHJs|?)Y;lTq^9;dG?ff&oDU5tC9R&q|QG)9!Asr?lBDR=pxMU?1G;2!RWcm}PEm#lop(pTEV zz#u6V%ncPMuZmYlL@cEDP>~u2DMmpGBSBWNHiE%=w0M4G@y1B{dbGGSvbch3c=G|t zh_Tp8^#V(S4rWz!nfzH%2*n^PYydZho#MW2;xU~CPHm}He4zI$Oyx>kH8j9nCCs%k zn1SmC@d?~Y(eGqR-Lg&p?P F{R>D&CEEZ1 literal 0 HcmV?d00001 diff --git a/target/classes/com/online201/pleonast/downloader/PleoDownloader.class b/target/classes/com/online201/pleonast/downloader/PleoDownloader.class new file mode 100644 index 0000000000000000000000000000000000000000..9bb9e244a16858ef93260b74a2909baf25f12a0a GIT binary patch literal 7674 zcmbtZ349#YdH;X9X0#du-o?TT1lGo2$+EqcZ49hrV@bA(MB_w+LWI^~TT>I9pn>~z%5q>>qH^QO*dHf7lvGv`L*_G~6) zn{msDj`G{C@~egh!DT1RlV&t!W)ji8lP9bxR}hGst~F{pIm;2W4cSg2dcuyIQ8$^k zqSLmMHr?nhP8sKWzxV1|PM1LMp5o?7cZZ<9t$jk!&}YXj9WC5tcsLr5N9X1u0|T*i z+B5@LhD$YEBDk&w`35e-aw^8nM@^TzWdz-ALkp^pxlS^Z=xVQ3*=7hr1{%>MxO9jf z8_B09Eoa=EOj&~9kUeFlCQK)(-u;hF-JWd$)MKTAR_L_Pok`{dn}=Sm=jp75w3(z8 zt!+oDm_gqKcHc8)W!v zmr7b0w;5~EuAxn^{=zGI9eb7lo3NZ5Z8WeB9fFqr!F{{?_Z~d9<=8gCM$fM=ZS!Ig zT>-4Y1{Il&g3h*=x3+ykt=?py6PxM#gyrrn`kv8kYrlxBvhn^JYO)pEG~7rNF6@M* zD-GO)n+2EWEO!sna%{>;X5HNxg~B)?({@hU27aIzq)~u+m}0Qa}qQFN)!^g}YN$n)(J{B5uG^0l9>-$E|zZ zK0D)jLkT1`Q1gnaQ;e8_6KX*Nc|09J3KN>n7n!vkcP3&_M`*f}{N;!1F_rHeptubIqz0w`gR7Am5R;}RjF>A#Xf;P5!B4A zESN2RJ}6~L5--a(29tXEc}7SSgT7!Y0hP(8m1OEHgLJ+hGQK5Qq!;%CbJ-cGf#=Y>5`P+0QzaGw|p53s#FBEAv^U8+Ck_6gWAj4r)67nvqjwU&r@}%1A`V-%<-!oCJQTwD|ia zDD0atoiXd~yp@@;UVAFg+ftO@I`|m5}L#eE1GmA$!)P)%Qgk1 z9z&Y&sNGt9216}PhG?Qw;z=`=xAsmmN|k>0xvolBbqSCTA~6Eylo*CA^ZH=bc6nqW zml9O#y}{*%1SLchIk;eAa1kq)8b(=RNUL0-pqadv5#w=wP?xZvIiiwSxt=U9R~d4( zyo$r@4wf`xC5;c0-{o3EBC?9s&6v3nkIE^K&1{Qm7E~!?b41ytw>jQryxc-c0`kgF zsi&x+7_jsP&u0_b94AV3S;uCl+S!IDt7Jx(^^B)F<{ZmeIdy_PqFyqr8me8FjfO;J z6FZFs4(^>R)|9@=!1{c3u|CM_)L0ox7}jL-MGx5J?rt!X&AYm6U8-=F=MT21@uLCh zlx>FGq>e?)nE(?CgXMam%Pp#7W7w0fCUn|nK40J_qP*`|ev(VP-2)Pn?S||Sbu?x3 zH+7c+_{FSN{TTOaP`DsZmmGZYi-EEV-sTj+<3iC^V$O?SZ{Bt7j9_c6IIA#*^6Xd4 zI<@p#ewMP$(`A5poJ#7#gRi*<+0PF%jy27?E(Z)bD2I5cAcD(SSVBu>^e!CJN(ESsP$X{vz92^pxl{tMqHBwm>drN^_-=Zmf)rNqbX(6W zZeBB~yIMXqL5IeFEEjxh=db#&IT)CS@o@y7@*o?j~w zU%(ds_m*`X^XTq4i=Bspy{EBjUH#?)ZY$vS0`@+OvElW>{RJEt*-(E5N9yn_%mStg zn4ZU7+Zvw2-QK!cu3OPiz+CV(rL_;7#Y2Zq;|;OKaO3)Uyzva)QilVL+nOH7wxyc| z-^Nudnz(*p+saiBH_3{o6%CKR_&Mc`de4D7a4Da!Lk&)b$6)ewB0T@M~UN zzR&MEe@0~9` zriB}yLNKfa|0ei@0{*Ume<re4|IeZ&&ZWT2~C%41n)n0uH$2LS~1}LSkC87bXzAw zznP)kf{naw9>9$_j%`Tta@xVoxF5IRF^2GIUP3>IZak0e!fR()!K?FYc~Pb^&I`P( z@`B8ik+2s~{=XLXpsN|$I!b>*u9G#~dn0}**UJqI*l~PJCO@A1P^!C?02= z_7hOSV#y4>UKj!<-a3_yy$B5;9(hAdyWR|g}&po6n9A75)61^<) zQZM&;IPTyH{xu$sx8fUezdXP=?ZKzmXRRR$j^ha;dJS