Added XBRL

joeyism · Dec 15, 2019 · f8e1c5e · f8e1c5e
1 parent 04cafb4
commit f8e1c5e
Show file tree

Hide file tree

Showing 5 changed files with 161 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -33,6 +33,16 @@ edgar = Edgar()
 possible_companies = edgar.find_company_name("Cisco System")
 ```
 
+To get XBRL data, run
+```python
+from edgar import Company, XBRL, XBRLElement
+
+company = Company("Oracle Corp", "0001341439")
+results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
+xbrl = XBRL(results[0])
+XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()  # returns a dictionary of name, value, and context_ref
+```
+
 ## API
 
 ### Company
@@ -57,6 +67,24 @@ Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html)
     * ownership: defaults to include. Options are include, exclude, only.
     * no_of_entries: defaults to 100. Returns the number of entries to be returned. Maximum is 100.
 
+##### get_10Ks
+Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of the concatenation of all the documents in the 10-K
+* **Input**
+    * no_of_documents (default: 1): number of documents to be retrieved
+
+##### get_document_type_from_10K
+Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of the document within 10-K
+* **Input**
+    * document_type: The type of document you want, e.g. 10-K, EX-3.2
+    * no_of_documents (default: 1): number of documents to be retrieved
+
+##### get_data_files_from_10K
+Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of the data file within 10-K
+* **Input**
+    * document_type: The type of document you want, e.g. EX-101.INS
+    * no_of_documents (default: 1): number of documents to be retrieved
+    * isxml (default: False): by default, the file is parsed with `html` in `lxml`, which is not case sensitive. If this is True, it is parsed with `etree`, which is case sensitive
+
 ### Edgar
 Gets all companies from EDGAR
 ##### get_cik_by_company_name
@@ -76,3 +104,11 @@ Returns a list of strings, each string contains the body of the specified docume
 * **Input**
     * tree: lxml.html form that is returned from Company.getAllFilings
     * no_of_documents: number of documents returned. If it is 1, the returned result is just one string, instead of a list of strings. Defaults to 1.
+
+### XBRL
+Parses data from XBRL
+* `relevant_children`
+    * get children that are not `context`
+* `relevant_children_parsed`
+    * get children that are not `context`, `unit`, `schemaRef`
+    * cleans tags
diff --git a/edgar/__init__.py b/edgar/__init__.py
@@ -3,8 +3,9 @@
 from .edgar import Edgar
 from .txtml import TXTML
 from .company import Company
+from .xbrl import XBRL, XBRLElement
 
-__version__ = "4.0.0"
+__version__ = "4.1.0"
 
 modules = glob.glob(dirname(__file__)+"/*.py")
 __all__ = [ basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
diff --git a/edgar/company.py b/edgar/company.py
@@ -1,5 +1,5 @@
 import requests
-from lxml import html
+from lxml import html, etree
 
 BASE_URL = "https://www.sec.gov"
 
@@ -57,6 +57,23 @@ def get_document_type_from_10K(self, document_type, no_of_documents=1):
               result.append(doc)
       return result
 
+    def get_data_files_from_10K(self, document_type, no_of_documents=1, isxml=False):
+      """
+      Download data files of a given type from this company's 10-K filings.
+
+      * document_type: substring looked for in the fourth cell of each row
+        of the filing's file table, e.g. "EX-101.INS"
+      * no_of_documents: how many groups returned by `_group_document_type`
+        are processed (the list is sliced to this length)
+      * isxml: forwarded to `get_request` for the data file itself; the
+        filing index page is always fetched with the default parser
+
+      Returns a list with one parsed document per matching row.
+      """
+      tree = self.get_all_filings(filing_type="10-K")
+      url_groups = self._group_document_type(tree, "10-K")[:no_of_documents]
+      result = []
+      for url_group in url_groups:
+        for url in url_group:
+          # URLs from the listing are site-relative, so prefix the host.
+          url = BASE_URL + url
+          content_page = get_request(url)
+          # Second element carrying class "tableFile" -- presumably the
+          # "Data Files" table; TODO confirm against the EDGAR page layout.
+          # NOTE(review): raises IndexError when fewer than two exist.
+          table = content_page.find_class("tableFile")[1]
+          for row in table.getchildren():
+            # Substring match on the text of the row's fourth cell.
+            # NOTE(review): `.text` is None for an empty cell, which would
+            # make this `in` test raise TypeError -- confirm it cannot occur.
+            if document_type in row.getchildren()[3].text:
+              # The link is the first child of the row's third cell.
+              href = row.getchildren()[2].getchildren()[0].attrib["href"]
+              href = BASE_URL + href
+              doc = get_request(href, isxml=isxml)
+              result.append(doc)
+      return result
+
     def get_10Ks(self, no_of_documents=1):
       tree = self.get_all_filings(filing_type="10-K")
       elems = tree.xpath('//*[@id="documentsbutton"]')[:no_of_documents]
@@ -76,9 +93,12 @@ def get_10K(self):
       return self.get_10Ks(no_of_documents=1)[0]
 
 
-def get_request(href):
+def get_request(href, isxml=False):
+    """
+    Fetch `href` with `requests.get` and parse the response body.
+
+    * isxml: when True the body is parsed with `etree.fromstring`,
+      otherwise with `html.fromstring`
+
+    NOTE(review): the HTTP status is not checked and no timeout is passed.
+    """
     page = requests.get(href)
-    return html.fromstring(page.content)
+    if isxml:
+      return etree.fromstring(page.content)
+    else:
+      return html.fromstring(page.content)
 
 def get_documents(tree, no_of_documents=1):
     BASE_URL = "https://www.sec.gov"

diff --git a/edgar/txtml.py b/edgar/txtml.py
@@ -1,3 +1,6 @@
+from lxml.etree import tostring
+import html
+
 class TXTML:
 
   @classmethod
@@ -28,3 +31,7 @@ def parse_full_10K(cls, doc):
       if properties['type'] == '10-K':
         text = text + html.text_content()
     return text
+
+  @classmethod
+  def to_xml(cls, doc):
+    """
+    Serialize `doc` with `lxml.etree.tostring`, decode the bytes as UTF-8
+    and replace HTML character references using `html.unescape`.
+    Returns the resulting string.
+    """
+    return html.unescape(tostring(doc).decode("utf8"))
diff --git a/edgar/xbrl.py b/edgar/xbrl.py
@@ -0,0 +1,93 @@
+from typing import Dict, List
+import re
+from datetime import datetime
+from lxml import etree
+
+class XBRL(etree.ElementBase):
+  """
+  lxml element subclass with helpers for reading an XBRL instance document.
+
+  The helpers below read from `self.child`, i.e. the first child of this
+  element -- presumably the parsed document passed to the constructor
+  (see the README usage `XBRL(results[0])`); TODO confirm.
+  """
+  # Substrings searched for inside a contextRef; the date portion is taken
+  # to end just before the earliest occurrence of any of them.
+  CONTEXT_REF_NOT_DATE = ["us-gaap", "srt", "dei"]
+
+  @classmethod
+  def clean_tag(cls, elem):
+    """
+    Strip the namespace prefix from `elem.tag` in place, so
+      {http://fasb.org/us-gaap/2018-01-31}Assets
+    becomes
+      Assets
+    A tag without "}" is left unchanged (find returns -1, slice starts at 0).
+    """
+    elem.tag = elem.tag[elem.tag.find("}")+1:]
+
+  @classmethod
+  def parse_context_ref(cls, context_ref):
+    """
+    Turn a contextRef string into a dict of ISO dates.
+
+    Duration_1_1_2018_To_12_31_2018 becomes
+      {"from": "2018-01-01", "to": "2018-12-31"}
+    As_Of_12_31_2017 becomes
+      {"from": "2017-12-31"}
+
+    If the string contains one of CONTEXT_REF_NOT_DATE, the (last) date
+    is cut off one character before the earliest such occurrence.
+
+    Returns None implicitly when `context_ref` starts with neither
+    "Duration" nor "As_Of". `datetime.strptime` raises ValueError when a
+    date slice does not match "%m_%d_%Y".
+    """
+    context_ref_to_date_text = lambda s: datetime.strptime(s, "%m_%d_%Y").date().strftime("%Y-%m-%d")
+    if context_ref.startswith("Duration"):
+      if not any([val in context_ref for val in cls.CONTEXT_REF_NOT_DATE]):
+        # len("DURATION")+1 skips the "Duration_" prefix (same length).
+        from_date = context_ref_to_date_text(context_ref[len("DURATION")+1:context_ref.find("_To_")])
+        # +4 skips past "_To_" itself.
+        to_date = context_ref_to_date_text(context_ref[context_ref.find("_To_")+4:])
+        return {"from": from_date, "to": to_date}
+      else:
+        from_date = context_ref_to_date_text(context_ref[len("DURATION")+1:context_ref.find("_To_")])
+        # Earliest position of any non-date marker that is present.
+        end_idx = min([context_ref.find(val) for val in cls.CONTEXT_REF_NOT_DATE if context_ref.find(val) > -1])
+        # end_idx-1 also drops the single character before the marker.
+        to_date = context_ref_to_date_text(context_ref[context_ref.find("_To_")+4:end_idx-1])
+        return {"from": from_date, "to": to_date}
+
+    elif context_ref.startswith("As_Of"):
+      if not any([val in context_ref for val in cls.CONTEXT_REF_NOT_DATE]):
+        return {"from": context_ref_to_date_text(context_ref[len("As_Of")+1:])}
+      else:
+        end_idx = min([context_ref.find(val) for val in cls.CONTEXT_REF_NOT_DATE if context_ref.find(val) > -1])
+        from_date = context_ref_to_date_text(context_ref[len("As_Of")+1:end_idx-1])
+        return {"from": from_date}
+
+  @property
+  def child(self):
+    """
+    First child of this element; raises IndexError if there is none.
+    """
+    return self.getchildren()[0]
+
+  @property
+  def relevant_children(self):
+    """
+    Get children of `self.child` that are not comments and whose tag does
+    not contain `context`
+    """
+    return [child for child in self.child.getchildren() if not isinstance(child, etree._Comment) and "context" not in child.tag]
+
+  @property
+  def relevant_children_parsed(self):
+    """
+    Get children of `self.child` that are not comments and whose tag does
+    not contain `context`, `unit` or `schemaRef`.
+
+    The namespace prefix is stripped from each returned element's tag via
+    `clean_tag`; this modifies the elements in place on every access.
+    """
+    children = [child for child in self.child.getchildren() if not isinstance(child, etree._Comment) and "context" not in child.tag and "unit" not in child.tag and "schemaRef" not in child.tag]
+    for elem in children:
+      XBRL.clean_tag(elem)
+    return children
+
+class XBRLElement(etree.ElementBase):
+  """
+  lxml element subclass exposing name, value and context of one XBRL fact.
+
+  All properties read from `self.child`, i.e. the first child of this
+  element -- presumably the fact element passed to the constructor (see
+  the README usage `XBRLElement(xbrl.relevant_children_parsed[15])`);
+  TODO confirm.
+  """
+
+  @property
+  def child(self):
+    """
+    First child of this element; raises IndexError if there is none.
+    """
+    return self.getchildren()[0]
+
+  @property
+  def attrib(self) -> Dict:
+    """
+    Attributes of `self.child` (not of this wrapper element).
+    """
+    return self.child.attrib
+
+  @property
+  def context_ref(self) -> Dict:
+    """
+    Result of `XBRL.parse_context_ref` on the `contextRef` attribute, or
+    an empty dict when that attribute is missing or empty.
+    """
+    return XBRL.parse_context_ref(self.attrib["contextRef"]) if self.attrib.get("contextRef") else {}
+
+  @property
+  def name(self):
+    """
+    Child tag split before each capital letter and joined with spaces,
+    e.g. AssetsCurrent becomes "Assets Current". Characters before the
+    first capital letter are not matched by the pattern and are dropped.
+    """
+    return ' '.join(re.findall('[A-Z][^A-Z]*', self.child.tag))
+
+  @property
+  def value(self) -> str:
+    """
+    Text of `self.child`.
+    """
+    return self.child.text
+
+  def to_dict(self) -> Dict:
+    """
+    Return a dict with the keys "name", "value" and "context_ref".
+    """
+    return {
+      "name": self.name,
+      "value": self.value,
+      "context_ref": self.context_ref
+    }