Skip to content

Commit

Permalink
Added XBRL
Browse files Browse the repository at this point in the history
  • Loading branch information
joeyism committed Dec 15, 2019
1 parent 04cafb4 commit f8e1c5e
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 4 deletions.
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ edgar = Edgar()
possible_companies = edgar.find_company_name("Cisco System")
```

To get XBRL data, run
```python
from edgar import Company, XBRL, XBRLElement

company = Company("Oracle Corp", "0001341439")
results = company.get_data_files_from_10K("EX-101.INS", isxml=True)
xbrl = XBRL(results[0])
XBRLElement(xbrl.relevant_children_parsed[15]).to_dict()  # returns a dictionary of name, value, and context_ref
```

## API

### Company
Expand All @@ -57,6 +67,24 @@ Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html)
* ownership: defaults to include. Options are include, exclude, only.
* no_of_entries: defaults to 100. Returns the number of entries to be returned. Maximum is 100.

##### get_10Ks
Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of concatenation of all the documents in the 10-K
* **Input**
* no_of_documents (default: 1): number of documents to be retrieved

##### get_document_type_from_10K
Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of the document within 10-K
* **Input**
* document_type: The type of document you want, e.g. 10-K, EX-3.2
* no_of_documents (default: 1): number of documents to be retrieved

##### get_data_files_from_10K
Returns the HTML in the form of [lxml.html](http://lxml.de/lxmlhtml.html) of the data file within 10-K
* **Input**
* document_type: The type of document you want, e.g. EX-101.INS
* no_of_documents (default: 1): number of documents to be retrieved
* isxml (default: False): by default, the file is parsed with `html` in `lxml`, which is not case sensitive. If this is True, it is parsed with `etree`, which is case sensitive

### Edgar
Gets all companies from EDGAR
##### get_cik_by_company_name
Expand All @@ -76,3 +104,11 @@ Returns a list of strings, each string contains the body of the specified docume
* **Input**
* tree: lxml.html form that is returned from Company.getAllFilings
* no_of_documents: number of document returned. If it is 1, the returned result is just one string, instead of a list of strings. Defaults to 1.

### XBRL
Parses data from XBRL
* `relevant_children`
* get children that are not `context`
* `relevant_children_parsed`
* get children that are not `context`, `unit`, `schemaRef`
* cleans tags
3 changes: 2 additions & 1 deletion edgar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from .edgar import Edgar
from .txtml import TXTML
from .company import Company
from .xbrl import XBRL, XBRLElement

__version__ = "4.0.0"
__version__ = "4.1.0"

# Collect the paths of all "*.py" files that sit next to this file.
# NOTE(review): glob, dirname, basename and isfile are imported above this
# hunk -- presumably from the glob / os.path modules; confirm.
modules = glob.glob(dirname(__file__)+"/*.py")
# Export each of those files by name (the ".py" suffix is sliced off),
# leaving out __init__.py itself.
__all__ = [ basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
26 changes: 23 additions & 3 deletions edgar/company.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import requests
from lxml import html
from lxml import html, etree

BASE_URL = "https://www.sec.gov"

Expand Down Expand Up @@ -57,6 +57,23 @@ def get_document_type_from_10K(self, document_type, no_of_documents=1):
result.append(doc)
return result

def get_data_files_from_10K(self, document_type, no_of_documents=1, isxml=False):
    """Fetch the files of a given type listed on 10-K filing index pages.

    The 10-K filings are looked up with ``get_all_filings`` and grouped with
    ``_group_document_type``; only the first ``no_of_documents`` groups are
    visited. On every index page the second ``tableFile`` table is scanned
    and each row whose type cell (4th cell) contains ``document_type`` has
    the file linked from its 3rd cell downloaded.

    :param document_type: text to look for in the type cell, e.g.
        ``"EX-101.INS"`` (substring match).
    :param no_of_documents: number of filing groups to visit (default 1).
    :param isxml: forwarded to ``get_request``; selects the XML parser
        instead of the HTML parser.
    :return: list of parsed documents, in the order they were found.
    """
    tree = self.get_all_filings(filing_type="10-K")
    url_groups = self._group_document_type(tree, "10-K")[:no_of_documents]
    result = []
    for url_group in url_groups:
        for url in url_group:
            content_page = get_request(BASE_URL + url)
            # NOTE(review): index 1 is presumably the "Data Files" table of
            # the filing index page -- confirm against the EDGAR layout.
            table = content_page.find_class("tableFile")[1]
            for row in table:
                cells = list(row)
                # Skip comments and rows too short to have a type cell
                # instead of failing with IndexError.
                if len(cells) < 4:
                    continue
                cell_text = cells[3].text
                # An empty type cell has ``text is None``; testing
                # ``document_type in None`` would raise TypeError.
                if cell_text is None or document_type not in cell_text:
                    continue
                href = cells[2][0].attrib["href"]
                result.append(get_request(BASE_URL + href, isxml=isxml))
    return result

def get_10Ks(self, no_of_documents=1):
tree = self.get_all_filings(filing_type="10-K")
elems = tree.xpath('//*[@id="documentsbutton"]')[:no_of_documents]
Expand All @@ -76,9 +93,12 @@ def get_10K(self):
return self.get_10Ks(no_of_documents=1)[0]


def get_request(href, isxml=False):
    """Download ``href`` and parse the response body with lxml.

    :param href: absolute URL to fetch.
    :param isxml: when true the body is parsed with ``etree.fromstring``;
        otherwise (the default) with ``html.fromstring``.
    :return: the element returned by the selected lxml parser.
    """
    page = requests.get(href)
    if isxml:
        return etree.fromstring(page.content)
    return html.fromstring(page.content)

def get_documents(tree, no_of_documents=1):
BASE_URL = "https://www.sec.gov"
Expand Down
7 changes: 7 additions & 0 deletions edgar/txtml.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from lxml.etree import tostring
import html

class TXTML:

@classmethod
Expand Down Expand Up @@ -28,3 +31,7 @@ def parse_full_10K(cls, doc):
if properties['type'] == '10-K':
text = text + html.text_content()
return text

@classmethod
def to_xml(cls, doc):
    """Serialize ``doc`` with ``tostring`` and return it as text.

    The serialized bytes are decoded as UTF-8 and character/entity
    references are then resolved with ``html.unescape``.
    """
    serialized = tostring(doc)
    markup = serialized.decode("utf8")
    return html.unescape(markup)
93 changes: 93 additions & 0 deletions edgar/xbrl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from typing import Dict, List
import re
from datetime import datetime
from lxml import etree

class XBRL(etree.ElementBase):
    """Element wrapper that exposes the entries of an XBRL document.

    The document to inspect is expected to be this element's first child
    (see ``child``); that child's own children are the entries filtered by
    ``relevant_children`` and ``relevant_children_parsed``.
    """

    # Substrings that mark the end of the date portion of a contextRef;
    # everything from the first one found onwards is ignored when parsing.
    CONTEXT_REF_NOT_DATE = ["us-gaap", "srt", "dei"]

    @staticmethod
    def _local_name(tag: str) -> str:
        """Return ``tag`` without its leading ``{namespace}`` part, if any."""
        return tag[tag.find("}") + 1:]

    @staticmethod
    def _to_iso_date(text: str) -> str:
        """Convert ``month_day_year`` text (e.g. ``12_31_2018``) to ``YYYY-MM-DD``."""
        return datetime.strptime(text, "%m_%d_%Y").date().strftime("%Y-%m-%d")

    @classmethod
    def clean_tag(cls, elem):
        """
        Parse tag so
        {http://fasb.org/us-gaap/2018-01-31}Assets
        becomes
        Assets

        The element is modified in place.
        """
        elem.tag = cls._local_name(elem.tag)

    @classmethod
    def parse_context_ref(cls, context_ref: str) -> Dict[str, str]:
        """
        Duration_1_1_2018_To_12_31_2018 becomes {"from": "2018-01-01", "to": "2018-12-31"}
        As_Of_12_31_2017 becomes {"from": "2017-12-31"}

        Anything from the first CONTEXT_REF_NOT_DATE marker onwards is
        ignored. A value that starts with neither ``Duration`` nor ``As_Of``
        yields an empty dict. ``ValueError`` is raised (by ``strptime``)
        when the date portion does not have the ``month_day_year`` form.
        """
        if context_ref.startswith("Duration"):
            prefix = "Duration"
        elif context_ref.startswith("As_Of"):
            prefix = "As_Of"
        else:
            # Unknown layout: report "no date information" rather than None.
            return {}

        start = len(prefix) + 1  # skip the prefix and the "_" after it
        marker_positions = [
            context_ref.find(marker)
            for marker in cls.CONTEXT_REF_NOT_DATE
            if marker in context_ref
        ]
        # The date part stops one character (the "_") before the first marker.
        end = min(marker_positions) - 1 if marker_positions else len(context_ref)

        if prefix == "As_Of":
            return {"from": cls._to_iso_date(context_ref[start:end])}

        separator = context_ref.find("_To_")
        return {
            "from": cls._to_iso_date(context_ref[start:separator]),
            "to": cls._to_iso_date(context_ref[separator + len("_To_"):end]),
        }

    @property
    def child(self):
        """First child of this element; IndexError when there is none."""
        return self[0]

    def _entries(self) -> List:
        """Children of ``child`` that are real elements.

        Comments, processing instructions and entities have a non-string
        ``tag`` in lxml and are left out.
        """
        return [node for node in self.child if isinstance(node.tag, str)]

    @property
    def relevant_children(self) -> List:
        """
        Get children that are not `context`

        The local tag name is compared exactly, so entries whose name or
        namespace merely contains "context" are kept.
        """
        return [
            node for node in self._entries()
            if self._local_name(node.tag) != "context"
        ]

    @property
    def relevant_children_parsed(self) -> List:
        """
        Get children that are not `context`, `unit`, `schemaRef`

        The namespace is stripped from the tag of every returned element;
        this modifies the underlying elements in place.
        """
        excluded = ("context", "unit", "schemaRef")
        children = [
            node for node in self._entries()
            if self._local_name(node.tag) not in excluded
        ]
        for elem in children:
            self.clean_tag(elem)
        return children

class XBRLElement(etree.ElementBase):
    """Element wrapper that exposes one XBRL entry as name / value / dates.

    The entry to describe is expected to be this element's first child
    (see ``child``).
    """

    @property
    def child(self):
        """First child of this element; IndexError when there is none."""
        return self[0]

    @property
    def attrib(self) -> Dict:
        """Attributes of the wrapped child (not of this wrapper element)."""
        return self.child.attrib

    @property
    def context_ref(self) -> Dict:
        """Dates parsed from the child's ``contextRef`` attribute.

        Returns an empty dict when the attribute is missing or empty, and
        also when ``XBRL.parse_context_ref`` yields nothing for it, so the
        result is always a dict.
        """
        raw = self.attrib.get("contextRef")
        if not raw:
            return {}
        return XBRL.parse_context_ref(raw) or {}

    @property
    def name(self):
        """Child tag split into space-separated words at capital letters.

        Characters before the first capital letter are dropped.
        """
        return ' '.join(re.findall(r'[A-Z][^A-Z]*', self.child.tag))

    @property
    def value(self) -> str:
        """Text content of the wrapped child."""
        return self.child.text

    def to_dict(self) -> Dict:
        """Return ``name``, ``value`` and ``context_ref`` as a dictionary."""
        return {
            "name": self.name,
            "value": self.value,
            "context_ref": self.context_ref
        }

0 comments on commit f8e1c5e

Please sign in to comment.