misja · kanarinka · Dec 4, 2013 · Jan 7, 2014 · Jan 9, 2014
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+
+boilerpipe-1.2.0-bin.tar.gz
+
+*.jar
+
+build/lib/boilerpipe/__init__.py
+
+build/lib/boilerpipe/extract/__init__.py
diff --git a/README.rst b/README.rst
@@ -43,4 +43,6 @@ Then, to extract relevant content:
 
 	extracted_html = extractor.getHTML()
 
+	extracted_title = extractor.getTitle()
+
 .. _Boilerpipe: http://code.google.com/p/boilerpipe/ 
diff --git a/src/boilerpipe/extract/__init__.py b/src/boilerpipe/extract/__init__.py
@@ -1,5 +1,5 @@
 import jpype
-import urllib2
+import requests
 import socket
 import charade
 import threading
@@ -29,16 +29,12 @@ class Extractor(object):
     source    = None
     data      = None
     headers   = {'User-Agent': 'Mozilla/5.0'}
-    
+
     def __init__(self, extractor='DefaultExtractor', **kwargs):
         if kwargs.get('url'):
-            request     = urllib2.Request(kwargs['url'], headers=self.headers)
-            connection  = urllib2.urlopen(request)
-            self.data   = connection.read()
-            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
-            if encoding.lower() == 'text/html':
-                encoding = charade.detect(self.data)['encoding']
-            self.data = unicode(self.data, encoding)
+            response = requests.request('GET', kwargs['url'], headers=self.headers, timeout=10)
+            self.data = response.text
+
         elif kwargs.get('html'):
             self.data = kwargs['html']
             if not isinstance(self.data, unicode):
@@ -68,6 +64,9 @@ def getText(self):
     def getHTML(self):
         highlighter = HTMLHighlighter.newExtractingInstance()
         return highlighter.process(self.source, self.data)
+
+    def getTitle(self):
+        return self.source.getTitle()
 
     def getImages(self):
         extractor = jpype.JClass(
diff --git a/README.rst b/README.rst
@@ -43,4 +43,6 @@ Then, to extract relevant content:
 
 	extracted_html = extractor.getHTML()
 
+	extracted_title = extractor.getTitle()
+
 .. _Boilerpipe: http://code.google.com/p/boilerpipe/