Skip to content

Commit

Permalink
Merge pull request karussell#39 from skyshard/abhishek/CRAW-189_teenv…
Browse files Browse the repository at this point in the history
…ogue_incomplete_extraction

CRAW-189: Incomplete extraction of teenvogue.com
  • Loading branch information
andresp99999 authored Jun 12, 2017
2 parents 91da6c7 + ce539b4 commit 56d2a00
Show file tree
Hide file tree
Showing 3 changed files with 295 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,9 @@ public boolean hasHTMLTags(String text){
aMap.put("nytimes.com", Arrays.asList(
"[class*=hidden]"
));
// teenvogue.com: selector registered in the map that is published below as
// NODES_TO_REMOVE_PER_DOMAIN. Presumably matches the social-sharing wrapper
// so it is stripped before extraction -- confirm against the page markup.
aMap.put("teenvogue.com", Arrays.asList(
"[class=rendition-social-outer]"
));

NODES_TO_REMOVE_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down Expand Up @@ -249,6 +252,9 @@ public boolean hasHTMLTags(String text){
aMap.put("sfchronicle.com", Arrays.asList(
"div[class=article-text]"
));
// teenvogue.com: selector registered in the map that is published below as
// BEST_ELEMENT_PER_DOMAIN. Presumably pins the gallery ("listicle") wrapper
// as the article body for this domain -- confirm against the page markup.
aMap.put("teenvogue.com", Arrays.asList(
"div[class=listicle-wrapper]"
));

BEST_ELEMENT_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down
20 changes: 20 additions & 0 deletions src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3110,6 +3110,26 @@ public void testSfchronicle() throws Exception {
compareDates("2015-09-30 00:00:00", res.getDate());
}

/**
 * Runs the extractor over the {@code thevogue.html} test resource for a
 * teenvogue.com gallery URL and checks the canonical URL, title, body text,
 * author fields and publication date of the result.
 */
@Test
public void testTheVogue() throws Exception {
    final String pageUrl = "http://www.teenvogue.com/gallery/back-to-school-awards-2017-best-dorm-decor-ideas";

    JResult request = new JResult();
    request.setUrl(pageUrl);

    // HTML comes from the classpath resource; presumably a snapshot of pageUrl.
    final String html = c.streamToString(getClass().getResourceAsStream("thevogue.html"));
    final JResult article = extractor.extractContent(request, html);

    assertEquals(pageUrl, article.getCanonicalUrl());
    assertEquals("Back to School Awards 2017: The Best Dorm Decor Ideas", article.getTitle());

    // The extracted text doubles as the failure message so a mismatch shows what was produced.
    final String body = article.getText();
    assertTrue(body, body.startsWith("This year for our second annual Back to School Awards, we wanted to make sure you were totally covered"));

    // Section headings that must appear somewhere in the extracted text.
    final String[] expectedHeadings = {
        "Best Starter Kitchen Set",
        "Best Room Spray",
        "Best Value Agenda",
        "Best USB Extendor"
    };
    for (String heading : expectedHeadings) {
        assertTrue(body, body.contains(heading));
    }

    assertTrue(body, body.endsWith("Want more Teen Vogue ? Make sure to ‘Like’ us on Facebook to stay in the know!"));

    final String expectedAuthor = "Hanna Howard";
    assertEquals(expectedAuthor, article.getAuthorName());
    assertEquals(expectedAuthor, article.getAuthorDescription());
    compareDates("2017-06-01 08:00:00", article.getDate());
}


public static void compareDates(String expectedDateString, Date actual) {
String[] patterns = {
"yyyy-MM-dd",
Expand Down
Loading

0 comments on commit 56d2a00

Please sign in to comment.