Skip to content

Commit

Permalink
Merge pull request karussell#42 from skyshard/abhishek/fix_author_content_extraction
Browse files Browse the repository at this point in the history

Fix extraction issues
  • Loading branch information
abhishek-airpr authored Jun 22, 2017
2 parents 3c7f2e5 + 4297516 commit 2a917e7
Show file tree
Hide file tree
Showing 17 changed files with 20,993 additions and 13 deletions.
1,299 changes: 1,299 additions & 0 deletions reuters.html

Large diffs are not rendered by default.

90 changes: 78 additions & 12 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,16 @@ public boolean hasHTMLTags(String text){

private static final Pattern NEGATIVE_STYLE =
Pattern.compile("hidden|display: ?none|font-size: ?small");
private static final String ELLIPSIS_PATTERN = "…|\\.\\.\\.";
private static final Pattern[] IGNORE_AUTHOR_PARTS = new Pattern[]{
// Deliberately keeping the patterns separate to make them more readable and maintainable

// Remove the Prefixes
Pattern.compile("(?<![a-zA-Z])(Door|Über|by|name|author|posted|twitter|handle|news|locally researched)(?![a-zA-Z])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS),
Pattern.compile("(?<![a-zA-Z])(Door|Über|by|name|author|posted|twitter|handle|news|locally researched|report(ing|ed)?( by)?|edit(ing|ed)( by)?)(?![a-zA-Z])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CHARACTER_CLASS),
// Remove the Suffixes
Pattern.compile("((\\|| - |, ).*)"),
// Remove any arbitrary special symbols
Pattern.compile("@|:")
Pattern.compile("(" + "@|:|\\(|\\)|" + ELLIPSIS_PATTERN + ")+"),
};
private static final Set<String> IGNORED_TITLE_PARTS = new LinkedHashSet<String>() {
{
Expand Down Expand Up @@ -152,7 +153,9 @@ public boolean hasHTMLTags(String text){
));
aMap.put("www.reuters.com", Arrays.asList(
"*[class=section main-content]", // odd case the "section main-content" class actually contains only the title.
"div[id=specialFeature]" // remove non-article section
"div[id=specialFeature]", // remove non-article section
"div.next-articles",
"span.articleLocation"
));
aMap.put("investors.com", Arrays.asList(
"*[class=special-report]",
Expand Down Expand Up @@ -206,6 +209,12 @@ public boolean hasHTMLTags(String text){
aMap.put("teenvogue.com", Arrays.asList(
"[class=rendition-social-outer]"
));
aMap.put("philly.com", Arrays.asList(
"[class=pad-and-half--top cb]"
));
aMap.put("foxnews.com", Arrays.asList(
"p:contains(RELATED:) ~ ul"
));

NODES_TO_REMOVE_PER_DOMAIN = Collections.unmodifiableMap(aMap);
}
Expand Down Expand Up @@ -341,7 +350,7 @@ public ArticleTextExtractor() {
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|title|truncate|slider|^sectioncolumns$|ad-container");
setHighlyNegative("policy-blk|followlinkedinsignin|^signupbox$");
setToRemove("feedback-prompt|story-footer|story-meta-footer|related-combined-coverage|visuallyhidden|ad_topjobs|slideshow-overlay__data|next-post-thumbnails|video-desc|related-links|^widget popular$|^widget marketplace$|^widget ad panel$|slideshowOverlay|^share-twitter$|^share-facebook$|^share-google-plus-1$|^inline-list tags$|^tag_title$|article_meta comments|^related-news$|^recomended$|^news_preview$|related--galleries|image-copyright--copyright|^credits$|^photocredit$|^morefromcategory$|^pag-photo-credit$|gallery-viewport-credit|^image-credit$|story-secondary$|carousel-body|slider_container|widget_stories|post-thumbs|^custom-share-links|socialTools|trendingStories|^metaArticleData$|jcarousel-container|module-video-slider|jcarousel-skin-tango|^most-read-content$|^commentBox$|^faqModal$|^widget-area|login-panel|^copyright$|relatedSidebar|shareFooterCntr|most-read-container|email-signup|outbrain|^wnStoryBodyGraphic|articleadditionalcontent|most-popular|shatner-box|form-errors|theme-summary|story-supplement|global-magazine-recent|nocontent");
setToRemove("feedback-prompt|story-footer|story-meta-footer|related-combined-coverage|visuallyhidden|ad_topjobs|slideshow-overlay__data|next-post-thumbnails|video-desc|related-links|^widget popular$|^widget marketplace$|^widget ad panel$|slideshowOverlay|^share-twitter$|^share-facebook$|^share-google-plus-1$|^inline-list tags$|^tag_title$|article_meta comments|^related-news$|^recomended$|^news_preview$|related--galleries|image-copyright--copyright|^credits$|^photocredit$|^morefromcategory$|^pag-photo-credit$|gallery-viewport-credit|^image-credit$|story-secondary$|carousel-body|slider_container|widget_stories|post-thumbs|^custom-share-links|socialTools|trendingStories|^metaArticleData$|jcarousel-container|module-video-slider|jcarousel-skin-tango|^most-read-content$|^commentBox$|^faqModal$|^widget-area|login-panel|^copyright$|relatedSidebar|shareFooterCntr|most-read-container|email-signup|outbrain|^wnStoryBodyGraphic|articleadditionalcontent|most-popular|shatner-box|form-errors|theme-summary|story-supplement|global-magazine-recent|nocontent|hidden-print|externallinks");
}

public ArticleTextExtractor setUnlikely(String unlikelyStr) {
Expand Down Expand Up @@ -2030,6 +2039,14 @@ protected String extractAuthorName(Document doc) {
}
}

if (authorName.isEmpty()) { // http://sdn.cioreview.com/cxoinsight/sdn-do-you-really-need-it-nid-24422-cid-147.html
result = doc.select("div#namepost").first();
if (result != null) {
authorName = SHelper.innerTrim(result.text().split(",")[0]);
if(DEBUG_AUTHOR_EXTRACTION && !authorName.isEmpty()) System.out.println("AUTHOR: div#namepost");
}
}

// meta tag approaches, get content
if (authorName.isEmpty()) {
result = doc.select("head meta[name=author]").first();
Expand All @@ -2045,11 +2062,21 @@ protected String extractAuthorName(Document doc) {
}

if (authorName.isEmpty()) { // for "schema.org creativework"
result = doc.select("[itemtype$=schema.org/Person]span[itemprop=author], [itemtype$=schema.org/Person]span[itemprop=name], [itemtype$=schema.org/Organization] span[itemprop=name]").first();
result = doc.select("[itemtype$=schema.org/Person] [itemprop=author], [itemtype$=schema.org/Person] [itemprop=name]").first();
if (result != null) {
authorName = SHelper.innerTrim(result.text());
if(DEBUG_AUTHOR_EXTRACTION && !authorName.isEmpty()) System.out.println("AUTHOR: for \"schema.org creativework\" [itemtype$=schema.org/Person] [itemprop=author], [itemtype$=schema.org/Person] [itemprop=name]");
}
}

// Check Person and Organization separately so that
// [itemtype$=schema.org/Person] is preferred over [itemtype$=schema.org/Organization] when both are present
if (authorName.isEmpty()) { // for "schema.org creativework"
result = doc.select("[itemtype$=schema.org/Organization] [itemprop=name]").first();
if (result != null) {
authorName = SHelper.innerTrim(result.text());
if(DEBUG_AUTHOR_EXTRACTION && !authorName.isEmpty()) System.out.println("AUTHOR: for \"schema.org creativework\" [itemtype$=schema.org/Organization] [itemprop=name]");
}
if(DEBUG_AUTHOR_EXTRACTION && !authorName.isEmpty()) System.out.println("AUTHOR: for \"schema.org creativework\" [itemtype$=schema.org/Person] span[itemprop=author], [itemtype$=schema.org/Person] span[itemprop=name]");
}

// globalbankingandfinance.com
Expand Down Expand Up @@ -2100,7 +2127,7 @@ protected String extractAuthorName(Document doc) {
result = doc.select("span[class=author],span[class=authorname],span[class=author-name],span[class=author_name]," +
"span[class=article-author-name],span[class=article_author_name]").first();
if (result != null) {
authorName = SHelper.innerTrim(result.ownText());
authorName = SHelper.innerTrim(result.text());
if(DEBUG_AUTHOR_EXTRACTION && !authorName.isEmpty()) System.out.println("AUTHOR: Generic check for class name having author");
}
}
Expand Down Expand Up @@ -2205,12 +2232,35 @@ protected String extractAuthorName(Document doc) {
if(DEBUG_AUTHOR_EXTRACTION && matches!=null && matches.size()>0) System.out.println("AUTHOR: address[class*=byline]");
}

// http://www.nydailynews.com/newswires/sports/kershaw-wins-bellinger-hits-2-homers-dodgers-top-indians-article-1.3245615
if (matches == null || matches.size() == 0){
matches = doc.select("div[itemtype$=schema.org/Person]");
if(DEBUG_AUTHOR_EXTRACTION && matches!=null && matches.size()>0) System.out.println("AUTHOR: div[itemtype$=schema.org/Person]");
}

// http://www.upi.com/Entertainment_News/TV/2017/06/19/Star-Trek-Discovery-gets-a-premiere-date-Sept-24/8671497893578/
// http://www.upi.com/Defense-News/2017/06/19/King-Aerospace-recieves-EO-5-aircraft-contract/2311497885914/
if (matches == null || matches.size() == 0) {
matches = doc.select("div.meta");
if (DEBUG_AUTHOR_EXTRACTION && matches != null && matches.size() > 0)
System.out.println("AUTHOR: div.meta");
}

// Regex match should be very last option in cases like
// http://www.reuters.com/article/us-mexico-oil-ninth-idUSKBN19A2M9
// http://www.reuters.com/article/us-safrica-mining-idUSKBN19A2PY
if (matches == null || matches.size() == 0) {
matches = doc.select(":containsOwn(reporting by), :containsOwn(reported by), :containsOwn(edited by), :containsOwn(editing by)");
if (DEBUG_AUTHOR_EXTRACTION && matches != null && matches.size() > 0)
System.out.println("AUTHOR: :containsOwn(reporting by), :containsOwn(reported by), :containsOwn(edited by), :containsOwn(editing by)");
}

// select the best element from them
if(matches != null){
if (matches != null) {
Element bestMatch = getBestMatchElement(matches);
if(!(bestMatch == null)) {
if (!(bestMatch == null)) {
authorName = bestMatch.text();
if(authorName.length() < MIN_AUTHOR_NAME_LENGTH){
if (authorName.length() < MIN_AUTHOR_NAME_LENGTH) {
authorName = bestMatch.text().split(",")[0];
}
}
Expand All @@ -2225,6 +2275,11 @@ protected String extractAuthorName(Document doc) {
System.out.println("AUTHOR: authorName=" + authorName);
}

// Remove date patterns if any
for (Pattern pattern:DATE_PATTERNS) {
authorName = pattern.matcher(authorName).replaceAll("");
}

for (Pattern pattern:IGNORE_AUTHOR_PARTS) {
authorName = pattern.matcher(authorName).replaceAll("");
}
Expand Down Expand Up @@ -2474,16 +2529,27 @@ protected String extractAuthorDescription(Document doc, String authorName){
return SHelper.innerTrim(matches.first().attr("href"));
}

// http://www.nydailynews.com/newswires/sports/kershaw-wins-bellinger-hits-2-homers-dodgers-top-indians-article-1.3245615
matches = doc.select("div[class=ra-credits]");
if (matches == null || matches.size() > 0){
authorDesc = SHelper.innerTrim(matches.first().ownText());
if(DEBUG_AUTHOR_DESC_EXTRACTION){
System.out.println("AUTHOR_DESC: div[class=ra-credits].ownText");
System.out.println("AUTHOR: AUTHOR_DESC=" + authorDesc);
}
return authorDesc;
}

try {
// If no author description was found, try to find a section where the author name
// is mentioned.
authorName = authorName.trim();
if(authorName.length()>8){
Elements nodes = doc.select(":containsOwn(" + authorName + ")");
Element bestMatch = getBestMatchElement(nodes);
if (bestMatch != null){
if (bestMatch != null) {
authorDesc = bestMatch.text();
if(DEBUG_AUTHOR_DESC_EXTRACTION){
if (DEBUG_AUTHOR_DESC_EXTRACTION) {
System.out.println("AUTHOR_DESC: containsOwn");
System.out.println("AUTHOR: AUTHOR_DESC=" + authorDesc);
}
Expand Down
Loading

0 comments on commit 2a917e7

Please sign in to comment.