diff --git a/.gitconfig b/.gitconfig deleted file mode 100644 index 4f7288d..0000000 --- a/.gitconfig +++ /dev/null @@ -1,16 +0,0 @@ -# Generated by nbdev_install_git_hooks -# -# If you need to disable this instrumentation do: -# git config --local --unset include.path -# -# To restore the filter -# git config --local include.path .gitconfig -# -# If you see notebooks not stripped, checked the filters are applied in .gitattributes -# -[filter "clean-nbs"] - clean = nbdev_clean_nbs --read_input_stream True - smudge = cat - required = true -[diff "ipynb"] - textconv = nbdev_clean_nbs --disp True --fname diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml new file mode 100644 index 0000000..29bfc57 --- /dev/null +++ b/.github/workflows/deploy.yaml @@ -0,0 +1,14 @@ +name: Deploy to GitHub Pages + +permissions: + contents: write + pages: write + +on: + push: + branches: [ "main", "master" ] + workflow_dispatch: +jobs: + deploy: + runs-on: ubuntu-latest + steps: [uses: fastai/workflows/quarto-ghp@master] diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 6792773..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: CI -on: [push, pull_request] -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - uses: actions/setup-python@v1 - with: - python-version: '3.6' - architecture: 'x64' - - name: Install the library - run: | - pip install nbdev jupyter - #pip install -e . - - name: Read all notebooks - run: | - nbdev_read_nbs - - name: Check if all notebooks are cleaned - run: | - echo "Check we are starting with clean git checkout" - if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi - echo "Trying to strip out notebooks" - nbdev_clean_nbs - echo "Check that strip out was unnecessary" - git status -s # display the status to see which nbs need cleaning up - if [ -n "$(git status -uno -s)" ]; then echo -e "!!! 
Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi - - name: Check if there is no diff library/notebooks - run: | - if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! Detected difference between the notebooks and the library"; false; fi - #- name: Run tests - #run: | - #nbdev_test_nbs diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 0000000..5608592 --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,7 @@ +name: CI +on: [workflow_dispatch, pull_request, push] + +jobs: + test: + runs-on: ubuntu-latest + steps: [uses: fastai/workflows/nbdev-ci@master] diff --git a/.gitignore b/.gitignore index b6e4761..596dafa 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ +_docs/ +_proc/ + diff --git a/Makefile b/Makefile deleted file mode 100644 index cdaa956..0000000 --- a/Makefile +++ /dev/null @@ -1,37 +0,0 @@ -.ONESHELL: -SHELL := /bin/bash -SRC = $(wildcard nbs/*.ipynb) - -all: causalnlp docs - -causalnlp: $(SRC) - nbdev_build_lib - touch causalnlp - -sync: - nbdev_update_lib - -docs_serve: docs - cd docs && bundle exec jekyll serve - -docs: $(SRC) - nbdev_build_docs - touch docs - -test: - nbdev_test_nbs - -release: pypi conda_release - nbdev_bump_version - -conda_release: - fastrelease_conda_package - -pypi: dist - twine upload --repository pypi dist/* - -dist: clean - python setup.py sdist bdist_wheel - -clean: - rm -rf dist diff --git a/causalnlp/_modidx.py b/causalnlp/_modidx.py new file mode 100644 index 0000000..d825bfa --- /dev/null +++ b/causalnlp/_modidx.py @@ -0,0 +1,450 @@ +# Autogenerated by nbdev + +d = { 'settings': { 'branch': 'main', + 'doc_baseurl': '/causalnlp/', + 'doc_host': 'https://amaiya.github.io', + 'git_url': 'https://github.com/amaiya/causalnlp/tree/main/', + 'lib_path': 'causalnlp'}, + 'syms': { 'causalnlp.analyzers': { 'causalnlp.analyzers.TextEncoder': ('analyzers.html#textencoder', 
'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TextEncoder.__init__': ( 'analyzers.html#textencoder.__init__', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TextEncoder.encode': ( 'analyzers.html#textencoder.encode', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel': ('analyzers.html#topicmodel', 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.__init__': ( 'analyzers.html#topicmodel.__init__', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel._check_build': ( 'analyzers.html#topicmodel._check_build', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel._check_model': ( 'analyzers.html#topicmodel._check_model', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel._rank_documents': ( 'analyzers.html#topicmodel._rank_documents', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.build': ('analyzers.html#topicmodel.build', 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_docs': ( 'analyzers.html#topicmodel.get_docs', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_doctopics': ( 'analyzers.html#topicmodel.get_doctopics', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_document_topic_distribution': ( 'analyzers.html#topicmodel.get_document_topic_distribution', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_sorted_docs': ( 'analyzers.html#topicmodel.get_sorted_docs', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_topics': ( 'analyzers.html#topicmodel.get_topics', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.get_word_weights': ( 'analyzers.html#topicmodel.get_word_weights', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.predict': ( 'analyzers.html#topicmodel.predict', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.print_topics': ( 'analyzers.html#topicmodel.print_topics', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.topics': ( 
'analyzers.html#topicmodel.topics', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.TopicModel.train': ('analyzers.html#topicmodel.train', 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.ZeroShotClassifier': ( 'analyzers.html#zeroshotclassifier', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.ZeroShotClassifier.__init__': ( 'analyzers.html#zeroshotclassifier.__init__', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.ZeroShotClassifier.predict': ( 'analyzers.html#zeroshotclassifier.predict', + 'causalnlp/analyzers.py'), + 'causalnlp.analyzers.list2chunks': ('analyzers.html#list2chunks', 'causalnlp/analyzers.py')}, + 'causalnlp.autocoder': { 'causalnlp.autocoder.Autocoder': ('autocoder.html#autocoder', 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.__init__': ( 'autocoder.html#autocoder.__init__', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder._binarize_df': ( 'autocoder.html#autocoder._binarize_df', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder._check_columns': ( 'autocoder.html#autocoder._check_columns', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder._format_to_df': ( 'autocoder.html#autocoder._format_to_df', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_callable': ( 'autocoder.html#autocoder.code_callable', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_custom_topics': ( 'autocoder.html#autocoder.code_custom_topics', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_emotion': ( 'autocoder.html#autocoder.code_emotion', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_lda_topics': ( 'autocoder.html#autocoder.code_lda_topics', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_sentiment': ( 'autocoder.html#autocoder.code_sentiment', + 'causalnlp/autocoder.py'), + 'causalnlp.autocoder.Autocoder.code_transformer': ( 'autocoder.html#autocoder.code_transformer', + 'causalnlp/autocoder.py')}, + 
'causalnlp.core.causalbert': { 'causalnlp.core.causalbert.CausalBert': ( 'core.causalbert.html#causalbert', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBert.__init__': ( 'core.causalbert.html#causalbert.__init__', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBert.forward': ( 'core.causalbert.html#causalbert.forward', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel': ( 'core.causalbert.html#causalbertmodel', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel.__init__': ( 'core.causalbert.html#causalbertmodel.__init__', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel.build_dataloader': ( 'core.causalbert.html#causalbertmodel.build_dataloader', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel.estimate_ate': ( 'core.causalbert.html#causalbertmodel.estimate_ate', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel.inference': ( 'core.causalbert.html#causalbertmodel.inference', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.CausalBertModel.train': ( 'core.causalbert.html#causalbertmodel.train', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.gelu': ('core.causalbert.html#gelu', 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.make_bow_vector': ( 'core.causalbert.html#make_bow_vector', + 'causalnlp/core/causalbert.py'), + 'causalnlp.core.causalbert.platt_scale': ( 'core.causalbert.html#platt_scale', + 'causalnlp/core/causalbert.py')}, + 'causalnlp.core.causalinference': { 'causalnlp.core.causalinference.CausalInferenceModel': ( 'core.causalinference.html#causalinferencemodel', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.__init__': ( 'core.causalinference.html#causalinferencemodel.__init__', + 'causalnlp/core/causalinference.py'), + 
'causalnlp.core.causalinference.CausalInferenceModel._balance': ( 'core.causalinference.html#causalinferencemodel._balance', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel._create_metalearner': ( 'core.causalinference.html#causalinferencemodel._create_metalearner', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel._predict': ( 'core.causalinference.html#causalinferencemodel._predict', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel._predict_shap': ( 'core.causalinference.html#causalinferencemodel._predict_shap', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.compute_propensity_scores': ( 'core.causalinference.html#causalinferencemodel.compute_propensity_scores', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.estimate_ate': ( 'core.causalinference.html#causalinferencemodel.estimate_ate', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.evaluate_robustness': ( 'core.causalinference.html#causalinferencemodel.evaluate_robustness', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.explain': ( 'core.causalinference.html#causalinferencemodel.explain', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.fit': ( 'core.causalinference.html#causalinferencemodel.fit', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.get_required_columns': ( 'core.causalinference.html#causalinferencemodel.get_required_columns', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.interpret': ( 'core.causalinference.html#causalinferencemodel.interpret', + 'causalnlp/core/causalinference.py'), + 
'causalnlp.core.causalinference.CausalInferenceModel.predict': ( 'core.causalinference.html#causalinferencemodel.predict', + 'causalnlp/core/causalinference.py'), + 'causalnlp.core.causalinference.CausalInferenceModel.tune_and_use_default_learner': ( 'core.causalinference.html#causalinferencemodel.tune_and_use_default_learner', + 'causalnlp/core/causalinference.py')}, + 'causalnlp.key_driver_analysis': { 'causalnlp.key_driver_analysis.KeyDriverAnalysis': ( 'key_driver_analysis.html#keydriveranalysis', + 'causalnlp/key_driver_analysis.py'), + 'causalnlp.key_driver_analysis.KeyDriverAnalysis.__init__': ( 'key_driver_analysis.html#keydriveranalysis.__init__', + 'causalnlp/key_driver_analysis.py'), + 'causalnlp.key_driver_analysis.KeyDriverAnalysis._preprocess': ( 'key_driver_analysis.html#keydriveranalysis._preprocess', + 'causalnlp/key_driver_analysis.py'), + 'causalnlp.key_driver_analysis.KeyDriverAnalysis.correlations': ( 'key_driver_analysis.html#keydriveranalysis.correlations', + 'causalnlp/key_driver_analysis.py'), + 'causalnlp.key_driver_analysis.KeyDriverAnalysis.importances': ( 'key_driver_analysis.html#keydriveranalysis.importances', + 'causalnlp/key_driver_analysis.py')}, + 'causalnlp.meta.base': { 'causalnlp.meta.base.BaseLearner': ('meta.base.html#baselearner', 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner._format_p': ( 'meta.base.html#baselearner._format_p', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner._set_propensity_models': ( 'meta.base.html#baselearner._set_propensity_models', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.bootstrap': ( 'meta.base.html#baselearner.bootstrap', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.estimate_ate': ( 'meta.base.html#baselearner.estimate_ate', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.fit': ('meta.base.html#baselearner.fit', 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.fit_predict': ( 
'meta.base.html#baselearner.fit_predict', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.get_importance': ( 'meta.base.html#baselearner.get_importance', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.get_shap_values': ( 'meta.base.html#baselearner.get_shap_values', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.plot_importance': ( 'meta.base.html#baselearner.plot_importance', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.plot_shap_dependence': ( 'meta.base.html#baselearner.plot_shap_dependence', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.plot_shap_values': ( 'meta.base.html#baselearner.plot_shap_values', + 'causalnlp/meta/base.py'), + 'causalnlp.meta.base.BaseLearner.predict': ( 'meta.base.html#baselearner.predict', + 'causalnlp/meta/base.py')}, + 'causalnlp.meta.explainer': { 'causalnlp.meta.explainer.Explainer': ( 'meta.explainer.html#explainer', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.__init__': ( 'meta.explainer.html#explainer.__init__', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.build_new_tau_models': ( 'meta.explainer.html#explainer.build_new_tau_models', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.check_conditions': ( 'meta.explainer.html#explainer.check_conditions', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.create_feature_names': ( 'meta.explainer.html#explainer.create_feature_names', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.default_importance': ( 'meta.explainer.html#explainer.default_importance', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.get_importance': ( 'meta.explainer.html#explainer.get_importance', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.get_shap_values': ( 'meta.explainer.html#explainer.get_shap_values', + 'causalnlp/meta/explainer.py'), + 
'causalnlp.meta.explainer.Explainer.perm_importance': ( 'meta.explainer.html#explainer.perm_importance', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.plot_importance': ( 'meta.explainer.html#explainer.plot_importance', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.plot_shap_dependence': ( 'meta.explainer.html#explainer.plot_shap_dependence', + 'causalnlp/meta/explainer.py'), + 'causalnlp.meta.explainer.Explainer.plot_shap_values': ( 'meta.explainer.html#explainer.plot_shap_values', + 'causalnlp/meta/explainer.py')}, + 'causalnlp.meta.propensity': { 'causalnlp.meta.propensity.ElasticNetPropensityModel': ( 'meta.propensity.html#elasticnetpropensitymodel', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.GradientBoostedPropensityModel': ( 'meta.propensity.html#gradientboostedpropensitymodel', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.GradientBoostedPropensityModel.__init__': ( 'meta.propensity.html#gradientboostedpropensitymodel.__init__', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.GradientBoostedPropensityModel._model': ( 'meta.propensity.html#gradientboostedpropensitymodel._model', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.GradientBoostedPropensityModel.fit': ( 'meta.propensity.html#gradientboostedpropensitymodel.fit', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.GradientBoostedPropensityModel.predict': ( 'meta.propensity.html#gradientboostedpropensitymodel.predict', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.LogisticRegressionPropensityModel': ( 'meta.propensity.html#logisticregressionpropensitymodel', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.LogisticRegressionPropensityModel._model': ( 'meta.propensity.html#logisticregressionpropensitymodel._model', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel': ( 'meta.propensity.html#propensitymodel', 
+ 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel.__init__': ( 'meta.propensity.html#propensitymodel.__init__', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel.__repr__': ( 'meta.propensity.html#propensitymodel.__repr__', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel._model': ( 'meta.propensity.html#propensitymodel._model', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel.fit': ( 'meta.propensity.html#propensitymodel.fit', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel.fit_predict': ( 'meta.propensity.html#propensitymodel.fit_predict', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.PropensityModel.predict': ( 'meta.propensity.html#propensitymodel.predict', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.SimplePropensityModel': ( 'meta.propensity.html#simplepropensitymodel', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.SimplePropensityModel._model': ( 'meta.propensity.html#simplepropensitymodel._model', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.calibrate': ( 'meta.propensity.html#calibrate', + 'causalnlp/meta/propensity.py'), + 'causalnlp.meta.propensity.compute_propensity_score': ( 'meta.propensity.html#compute_propensity_score', + 'causalnlp/meta/propensity.py')}, + 'causalnlp.meta.rlearner': { 'causalnlp.meta.rlearner.BaseRClassifier': ( 'meta.rlearner.html#baserclassifier', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRClassifier.__init__': ( 'meta.rlearner.html#baserclassifier.__init__', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRClassifier.fit': ( 'meta.rlearner.html#baserclassifier.fit', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRClassifier.predict': ( 'meta.rlearner.html#baserclassifier.predict', + 'causalnlp/meta/rlearner.py'), + 
'causalnlp.meta.rlearner.BaseRLearner': ( 'meta.rlearner.html#baserlearner', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.__init__': ( 'meta.rlearner.html#baserlearner.__init__', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.__repr__': ( 'meta.rlearner.html#baserlearner.__repr__', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.estimate_ate': ( 'meta.rlearner.html#baserlearner.estimate_ate', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.fit': ( 'meta.rlearner.html#baserlearner.fit', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.fit_predict': ( 'meta.rlearner.html#baserlearner.fit_predict', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRLearner.predict': ( 'meta.rlearner.html#baserlearner.predict', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRRegressor': ( 'meta.rlearner.html#baserregressor', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.BaseRRegressor.__init__': ( 'meta.rlearner.html#baserregressor.__init__', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.XGBRRegressor': ( 'meta.rlearner.html#xgbrregressor', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.XGBRRegressor.__init__': ( 'meta.rlearner.html#xgbrregressor.__init__', + 'causalnlp/meta/rlearner.py'), + 'causalnlp.meta.rlearner.XGBRRegressor.fit': ( 'meta.rlearner.html#xgbrregressor.fit', + 'causalnlp/meta/rlearner.py')}, + 'causalnlp.meta.sensitivity': { 'causalnlp.meta.sensitivity.Sensitivity': ( 'meta.sensitivity.html#sensitivity', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.__init__': ( 'meta.sensitivity.html#sensitivity.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.get_ate_ci': ( 'meta.sensitivity.html#sensitivity.get_ate_ci', + 'causalnlp/meta/sensitivity.py'), + 
'causalnlp.meta.sensitivity.Sensitivity.get_class_object': ( 'meta.sensitivity.html#sensitivity.get_class_object', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.get_prediction': ( 'meta.sensitivity.html#sensitivity.get_prediction', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.sensitivity_analysis': ( 'meta.sensitivity.html#sensitivity.sensitivity_analysis', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.sensitivity_estimate': ( 'meta.sensitivity.html#sensitivity.sensitivity_estimate', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.Sensitivity.summary': ( 'meta.sensitivity.html#sensitivity.summary', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityPlaceboTreatment': ( 'meta.sensitivity.html#sensitivityplacebotreatment', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityPlaceboTreatment.__init__': ( 'meta.sensitivity.html#sensitivityplacebotreatment.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityPlaceboTreatment.sensitivity_estimate': ( 'meta.sensitivity.html#sensitivityplacebotreatment.sensitivity_estimate', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomCause': ( 'meta.sensitivity.html#sensitivityrandomcause', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomCause.__init__': ( 'meta.sensitivity.html#sensitivityrandomcause.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomCause.sensitivity_estimate': ( 'meta.sensitivity.html#sensitivityrandomcause.sensitivity_estimate', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomReplace': ( 'meta.sensitivity.html#sensitivityrandomreplace', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomReplace.__init__': ( 
'meta.sensitivity.html#sensitivityrandomreplace.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivityRandomReplace.sensitivity_estimate': ( 'meta.sensitivity.html#sensitivityrandomreplace.sensitivity_estimate', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias': ( 'meta.sensitivity.html#sensitivityselectionbias', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias.__init__': ( 'meta.sensitivity.html#sensitivityselectionbias.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias.causalsens': ( 'meta.sensitivity.html#sensitivityselectionbias.causalsens', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias.partial_rsqs_confounding': ( 'meta.sensitivity.html#sensitivityselectionbias.partial_rsqs_confounding', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias.plot': ( 'meta.sensitivity.html#sensitivityselectionbias.plot', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySelectionBias.summary': ( 'meta.sensitivity.html#sensitivityselectionbias.summary', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySubsetData': ( 'meta.sensitivity.html#sensitivitysubsetdata', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySubsetData.__init__': ( 'meta.sensitivity.html#sensitivitysubsetdata.__init__', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.SensitivitySubsetData.sensitivity_estimate': ( 'meta.sensitivity.html#sensitivitysubsetdata.sensitivity_estimate', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.alignment': ( 'meta.sensitivity.html#alignment', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.alignment_att': ( 'meta.sensitivity.html#alignment_att', + 'causalnlp/meta/sensitivity.py'), + 
'causalnlp.meta.sensitivity.one_sided': ( 'meta.sensitivity.html#one_sided', + 'causalnlp/meta/sensitivity.py'), + 'causalnlp.meta.sensitivity.one_sided_att': ( 'meta.sensitivity.html#one_sided_att', + 'causalnlp/meta/sensitivity.py')}, + 'causalnlp.meta.slearner': { 'causalnlp.meta.slearner.BaseSClassifier': ( 'meta.slearner.html#basesclassifier', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSClassifier.__init__': ( 'meta.slearner.html#basesclassifier.__init__', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSClassifier.predict': ( 'meta.slearner.html#basesclassifier.predict', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner': ( 'meta.slearner.html#baseslearner', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.__init__': ( 'meta.slearner.html#baseslearner.__init__', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.__repr__': ( 'meta.slearner.html#baseslearner.__repr__', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.estimate_ate': ( 'meta.slearner.html#baseslearner.estimate_ate', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.fit': ( 'meta.slearner.html#baseslearner.fit', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.fit_predict': ( 'meta.slearner.html#baseslearner.fit_predict', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSLearner.predict': ( 'meta.slearner.html#baseslearner.predict', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSRegressor': ( 'meta.slearner.html#basesregressor', + 'causalnlp/meta/slearner.py'), + 'causalnlp.meta.slearner.BaseSRegressor.__init__': ( 'meta.slearner.html#basesregressor.__init__', + 'causalnlp/meta/slearner.py')}, + 'causalnlp.meta.tlearner': { 'causalnlp.meta.tlearner.BaseTClassifier': ( 'meta.tlearner.html#basetclassifier', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTClassifier.__init__': 
( 'meta.tlearner.html#basetclassifier.__init__', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTClassifier.predict': ( 'meta.tlearner.html#basetclassifier.predict', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner': ( 'meta.tlearner.html#basetlearner', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.__init__': ( 'meta.tlearner.html#basetlearner.__init__', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.__repr__': ( 'meta.tlearner.html#basetlearner.__repr__', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.estimate_ate': ( 'meta.tlearner.html#basetlearner.estimate_ate', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.fit': ( 'meta.tlearner.html#basetlearner.fit', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.fit_predict': ( 'meta.tlearner.html#basetlearner.fit_predict', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTLearner.predict': ( 'meta.tlearner.html#basetlearner.predict', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTRegressor': ( 'meta.tlearner.html#basetregressor', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.BaseTRegressor.__init__': ( 'meta.tlearner.html#basetregressor.__init__', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.MLPTRegressor': ( 'meta.tlearner.html#mlptregressor', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.MLPTRegressor.__init__': ( 'meta.tlearner.html#mlptregressor.__init__', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.XGBTRegressor': ( 'meta.tlearner.html#xgbtregressor', + 'causalnlp/meta/tlearner.py'), + 'causalnlp.meta.tlearner.XGBTRegressor.__init__': ( 'meta.tlearner.html#xgbtregressor.__init__', + 'causalnlp/meta/tlearner.py')}, + 'causalnlp.meta.utils': { 'causalnlp.meta.utils.MatchOptimizer': ('meta.utils.html#matchoptimizer', 'causalnlp/meta/utils.py'), + 
'causalnlp.meta.utils.MatchOptimizer.__init__': ( 'meta.utils.html#matchoptimizer.__init__', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.MatchOptimizer.check_table_one': ( 'meta.utils.html#matchoptimizer.check_table_one', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.MatchOptimizer.match_and_check': ( 'meta.utils.html#matchoptimizer.match_and_check', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.MatchOptimizer.search_best_match': ( 'meta.utils.html#matchoptimizer.search_best_match', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.MatchOptimizer.single_match': ( 'meta.utils.html#matchoptimizer.single_match', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.NearestNeighborMatch': ( 'meta.utils.html#nearestneighbormatch', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.NearestNeighborMatch.__init__': ( 'meta.utils.html#nearestneighbormatch.__init__', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.NearestNeighborMatch.match': ( 'meta.utils.html#nearestneighbormatch.match', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.NearestNeighborMatch.match_by_group': ( 'meta.utils.html#nearestneighbormatch.match_by_group', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.ape': ('meta.utils.html#ape', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.check_explain_conditions': ( 'meta.utils.html#check_explain_conditions', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.check_p_conditions': ( 'meta.utils.html#check_p_conditions', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.check_treatment_vector': ( 'meta.utils.html#check_treatment_vector', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.classification_metrics': ( 'meta.utils.html#classification_metrics', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.clean_xgboost_objective': ( 'meta.utils.html#clean_xgboost_objective', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.convert_pd_to_np': ( 'meta.utils.html#convert_pd_to_np', + 
'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.create_table_one': ( 'meta.utils.html#create_table_one', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.get_xgboost_objective_metric': ( 'meta.utils.html#get_xgboost_objective_metric', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.gini': ('meta.utils.html#gini', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.logloss': ('meta.utils.html#logloss', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.mape': ('meta.utils.html#mape', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.regression_metrics': ( 'meta.utils.html#regression_metrics', + 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.rmse': ('meta.utils.html#rmse', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.smape': ('meta.utils.html#smape', 'causalnlp/meta/utils.py'), + 'causalnlp.meta.utils.smd': ('meta.utils.html#smd', 'causalnlp/meta/utils.py')}, + 'causalnlp.meta.xlearner': { 'causalnlp.meta.xlearner.BaseXClassifier': ( 'meta.xlearner.html#basexclassifier', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXClassifier.__init__': ( 'meta.xlearner.html#basexclassifier.__init__', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXClassifier.fit': ( 'meta.xlearner.html#basexclassifier.fit', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXClassifier.predict': ( 'meta.xlearner.html#basexclassifier.predict', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner': ( 'meta.xlearner.html#basexlearner', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.__init__': ( 'meta.xlearner.html#basexlearner.__init__', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.__repr__': ( 'meta.xlearner.html#basexlearner.__repr__', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.estimate_ate': ( 'meta.xlearner.html#basexlearner.estimate_ate', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.fit': 
( 'meta.xlearner.html#basexlearner.fit', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.fit_predict': ( 'meta.xlearner.html#basexlearner.fit_predict', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXLearner.predict': ( 'meta.xlearner.html#basexlearner.predict', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXRegressor': ( 'meta.xlearner.html#basexregressor', + 'causalnlp/meta/xlearner.py'), + 'causalnlp.meta.xlearner.BaseXRegressor.__init__': ( 'meta.xlearner.html#basexregressor.__init__', + 'causalnlp/meta/xlearner.py')}, + 'causalnlp.preprocessing': { 'causalnlp.preprocessing.DataframePreprocessor': ( 'preprocessing.html#dataframepreprocessor', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor.__init__': ( 'preprocessing.html#dataframepreprocessor.__init__', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor._check_binary': ( 'preprocessing.html#dataframepreprocessor._check_binary', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor._check_type': ( 'preprocessing.html#dataframepreprocessor._check_type', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor._get_feature_names': ( 'preprocessing.html#dataframepreprocessor._get_feature_names', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor._preprocess_column': ( 'preprocessing.html#dataframepreprocessor._preprocess_column', + 'causalnlp/preprocessing.py'), + 'causalnlp.preprocessing.DataframePreprocessor.preprocess': ( 'preprocessing.html#dataframepreprocessor.preprocess', + 'causalnlp/preprocessing.py')}}} diff --git a/causalnlp/analyzers.py b/causalnlp/analyzers.py index 287318e..747ae40 100644 --- a/causalnlp/analyzers.py +++ b/causalnlp/analyzers.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/02_analyzers.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! 
File to edit: ../nbs/02_analyzers.ipynb. -__all__ = ['list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel', 'DEFAULT_TOKEN_PATTERN'] - -# Cell +# %% auto 0 +__all__ = ['DEFAULT_TOKEN_PATTERN', 'list2chunks', 'ZeroShotClassifier', 'TextEncoder', 'TopicModel'] +# %% ../nbs/02_analyzers.ipynb 4 import math import warnings import numpy as np @@ -13,8 +13,7 @@ def list2chunks(a, n): k, m = divmod(len(a), n) return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)) -# Cell - +# %% ../nbs/02_analyzers.ipynb 5 class ZeroShotClassifier(): """ Interface to Zero Shot Topic Classifier @@ -61,22 +60,22 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True, if len(topic_strings) is large. - nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference - topic_strings(list): alias for labels parameter for backwards compatibility - + **Returns:** - - + + inferred probabilities or list of inferred probabilities if doc is list """ # error checks is_str_input = False - if not isinstance(docs, (list, np.ndarray)): + if not isinstance(docs, (list, np.ndarray)): docs = [docs] is_str_input = True if not isinstance(docs[0], str): raise ValueError('docs must be string or a list of strings representing document(s)') if len(labels) > 0 and len(topic_strings) > 0: raise ValueError('labels and topic_strings are mutually exclusive') if not labels and not topic_strings: raise ValueError('labels must be a list of strings') - if topic_strings: + if topic_strings: labels = topic_strings @@ -117,8 +116,7 @@ def predict(self, docs, labels=[], include_labels=False, multilabel=True, if is_str_input: scores = scores[0] return scores -# Cell - +# %% ../nbs/02_analyzers.ipynb 10 #from sentence_transformers import SentenceTransformer, util class TextEncoder(): @@ -146,7 +144,7 @@ def __init__(self, model_name='stsb-roberta-large', device=None): self.torch_device = device if self.torch_device is None: 
self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' self.model = SentenceTransformer(model_name) - + def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False): """Generate embedding for supplied text""" if isinstance(texts, str): texts = [texts] @@ -154,8 +152,7 @@ def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False) show_progress_bar=show_progress_bar, normalize_embeddings=normalize, convert_to_tensor=False, device=self.torch_device ) -# Cell - +# %% ../nbs/02_analyzers.ipynb 16 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.decomposition import NMF, LatentDirichletAllocation import math @@ -166,7 +163,7 @@ def encode(self, texts, batch_size=32, normalize=False, show_progress_bar=False) class TopicModel(): - def __init__(self,texts=None, n_topics=None, n_features=10000, + def __init__(self,texts=None, n_topics=None, n_features=10000, min_df=5, max_df=0.5, stop_words='english', model_type='lda', lda_max_iter=5, lda_mode='online', @@ -176,7 +173,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000, """ Fits a topic model to documents in . Example: - tm = ktrain.text.get_topic_model(docs, n_topics=20, + tm = ktrain.text.get_topic_model(docs, n_topics=20, n_features=1000, min_df=2, max_df=0.95) Args: texts (list of str): list of texts @@ -191,7 +188,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000, If lda_mode='batch', this should be increased (e.g., 1500). Ignored if model_type != 'lda' lda_mode (str): one of {'online', 'batch'}. Ignored if model_type !='lda' - token_pattern(str): regex pattern to use to tokenize documents. + token_pattern(str): regex pattern to use to tokenize documents. 
verbose(bool): verbosity """ self.verbose=verbose @@ -208,7 +205,7 @@ def __init__(self,texts=None, n_topics=None, n_features=10000, if texts is not None: (model, vectorizer) = self.train(texts, model_type=model_type, n_topics=n_topics, n_features=n_features, - min_df = min_df, max_df = max_df, + min_df = min_df, max_df = max_df, stop_words=stop_words, lda_max_iter=lda_max_iter, lda_mode=lda_mode, token_pattern=token_pattern, @@ -258,7 +255,7 @@ def train(self,texts, model_type='lda', n_topics=None, n_features=10000, vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=n_features, stop_words=stop_words, token_pattern=token_pattern, ngram_range=ngram_range) - + x_train = vectorizer.fit_transform(texts) @@ -321,7 +318,7 @@ def get_word_weights(self, topic_id, n_words=100): Returns a list tuples of the form: (word, weight) for given topic_id. """ self._check_model() - if topic_id+1 > len(self.model.components_): + if topic_id+1 > len(self.model.components_): raise ValueError('topic_id must be less than %s' % (len(self.model.components_))) feature_names = self.vectorizer.get_feature_names() word_probs = self.model.components_[topic_id] @@ -352,7 +349,7 @@ def print_topics(self, n_words=10, show_counts=False): topics = self.get_topics(n_words=n_words, as_string=True) if show_counts: self._check_build() - topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()], + topic_counts = sorted([ (k, topics[k], len(v)) for k,v in self.topic_dict.items()], key=lambda kv:kv[-1], reverse=True) for (idx, topic, count) in topic_counts: print("topic:%s | count:%s | %s" %(idx, count, topic)) @@ -371,12 +368,12 @@ def build(self, texts): self.topic_dict = self._rank_documents(texts, doc_topics=doc_topics) return + - - + def get_docs(self, topic_ids=[], doc_ids=[], rank=False): """ - Returns document entries for supplied topic_ids. + Returns document entries for supplied topic_ids. 
""" self._check_build() if not topic_ids: @@ -384,7 +381,7 @@ def get_docs(self, topic_ids=[], doc_ids=[], rank=False): result_texts = [] for topic_id in topic_ids: if topic_id not in self.topic_dict: continue - texts = [{'text':tup[0], 'doc_id':tup[1], 'topic_proba':tup[2], 'topic_id':topic_id} for tup in self.topic_dict[topic_id] + texts = [{'text':tup[0], 'doc_id':tup[1], 'topic_proba':tup[2], 'topic_id':topic_id} for tup in self.topic_dict[topic_id] if not doc_ids or tup[1] in doc_ids] result_texts.extend(texts) if not rank: @@ -439,10 +436,10 @@ def _rank_documents(self, result_dict[topic_id] = lst return result_dict - + def _check_build(self): self._check_model() - if self.topic_dict is None: + if self.topic_dict is None: raise Exception('Must call build() method.') diff --git a/causalnlp/autocoder.py b/causalnlp/autocoder.py index e018c29..0948c15 100644 --- a/causalnlp/autocoder.py +++ b/causalnlp/autocoder.py @@ -1,8 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/01_autocoder.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_autocoder.ipynb. +# %% auto 0 __all__ = ['Autocoder'] -# Cell +# %% ../nbs/01_autocoder.ipynb 4 import numpy as np import pandas as pd pd.set_option('display.max_columns', 500) @@ -31,9 +32,9 @@ def _format_to_df(self, results, df): lst = d.get(label, []) lst.append(prob) d[label] = lst - new_df = df.join(pd.DataFrame(d, index=df.index)) + new_df = df.join(pd.DataFrame(d, index=df.index)) return new_df - + def _binarize_df(self, df, colnames, threshold=0.5): """ Binarizes each column in `colnames` based on threshold. @@ -41,14 +42,14 @@ def _binarize_df(self, df, colnames, threshold=0.5): for col in colnames: df[col] = (df[col] >= threshold).astype(int) return df - + def _check_columns(self, labels, df): """check columns""" cols = df.columns.values for l in labels: if l in cols: raise ValueError('There is already a column named %s in your DataFrame.' 
% (l)) - + def code_sentiment(self, docs, df, batch_size=8, binarize=False, threshold=0.5): """ @@ -60,10 +61,10 @@ def code_sentiment(self, docs, df, batch_size=8, binarize=False, threshold=0.5): results = self.zsl.predict(docs, labels=labels, include_labels=True, multilabel=False, batch_size=batch_size, nli_template="The sentiment of this movie review is {}.") - df= self._format_to_df(results, df) + df= self._format_to_df(results, df) if binarize: df = self._binarize_df(df, labels, threshold=threshold) return df - + def code_emotion(self, docs, df, batch_size=8, binarize=False, threshold=0.5): """ Autocodes text for emotion @@ -74,22 +75,22 @@ def code_emotion(self, docs, df, batch_size=8, binarize=False, threshold=0.5): results = self.zsl.predict(docs, labels=labels, include_labels=True, multilabel=False, batch_size=batch_size, nli_template="The emotion of this text is {}.") - df= self._format_to_df(results, df) + df= self._format_to_df(results, df) if binarize: df = self._binarize_df(df, labels, threshold=threshold) - return df - + return df + def code_custom_topics(self, docs, df, labels, batch_size=8, binarize=False, threshold=0.5): """ Autocodes text for user-specified topics. The `label` field is the name of the topic as a string (or a list of them.) 
""" self._check_columns(labels, df) - + results = self.zsl.predict(docs, labels=labels, include_labels=True, batch_size=8) - df = self._format_to_df(results, df) + df = self._format_to_df(results, df) if binarize: df = self._binarize_df(df, labels, threshold=threshold) return df - + def code_lda_topics(self, docs, df, k=10, n_features=10000): """ Encode texts as semantically meaningful vectors using Latent Dirichlet Alocation @@ -104,8 +105,8 @@ def code_lda_topics(self, docs, df, k=10, n_features=10000): vals = [v for v in data] results.append( list(zip(keys, vals)) ) df = self._format_to_df(results, df) - return df - + return df + def code_callable(self, docs, df, fn): """ Autocodes text for any user-specified function @@ -113,13 +114,13 @@ def code_callable(self, docs, df, fn): text in `docs` where the keys are desired column names and values are scores or probabilities. """ - + results = [] for doc in docs: results.append(fn(doc)) - df = self._format_to_df(results, df) + df = self._format_to_df(results, df) return df - def code_transformer(self, docs, df, batch_size=32, + def code_transformer(self, docs, df, batch_size=32, model_name='stsb-roberta-large', show_progress_bar=False): """ Encode texts as semantically meaningful vectors using a Transformer model @@ -133,4 +134,4 @@ def code_transformer(self, docs, df, batch_size=32, vals = [v for v in data] results.append( list(zip(keys, vals)) ) df = self._format_to_df(results, df) - return df \ No newline at end of file + return df diff --git a/causalnlp/core/causalbert.py b/causalnlp/core/causalbert.py index 19e0c6e..028b516 100644 --- a/causalnlp/core/causalbert.py +++ b/causalnlp/core/causalbert.py @@ -1,11 +1,11 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/00b_core.causalbert.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/00b_core.causalbert.ipynb. 
-__all__ = ['platt_scale', 'gelu', 'make_bow_vector', 'CausalBert', 'CausalBertModel', 'CUDA', 'MASK_IDX'] +# %% auto 0 +__all__ = ['CUDA', 'MASK_IDX', 'platt_scale', 'gelu', 'make_bow_vector', 'CausalBert', 'CausalBertModel'] -# Cell - -# An extensible implementation of the Causal Bert model from -# "Adapting Text Embeddings for Causal Inference" +# %% ../../nbs/00b_core.causalbert.ipynb 4 +# An extensible implementation of the Causal Bert model from +# "Adapting Text Embeddings for Causal Inference" # (https://arxiv.org/abs/1905.12741) #This implementation has been adapted from this GitHub repository: # https://github.com/rpryzant/causal-bert-pytorch @@ -100,7 +100,7 @@ def __init__(self, config): nn.ReLU(), nn.Linear(200, self.num_labels)) - self.g_cls = nn.Linear(config.hidden_size + self.num_labels, + self.g_cls = nn.Linear(config.hidden_size + self.num_labels, self.config.num_labels) self.init_weights() @@ -143,9 +143,9 @@ def forward(self, W_ids, W_len, W_mask, C, T, Y=None, use_mlm=True): else: g_loss = 0.0 - # conditional expected outcome logits: + # conditional expected outcome logits: # run each example through its corresponding T matrix - # TODO this would be cleaner with sigmoid and BCELoss, but less general + # TODO this would be cleaner with sigmoid and BCELoss, but less general # (and I couldn't get it to work as well) Q_logits_T0 = self.Q_cls['0'](inputs) Q_logits_T1 = self.Q_cls['1'](inputs) @@ -186,7 +186,7 @@ def __init__(self, g_weight=0.0, Q_weight=0.1, mlm_weight=1.0, The resultant model can be used to estimate treatment effects for observations. 
""" - if 'distilbert' not in model_name: + if 'distilbert' not in model_name: raise ValueError('CausalBert currently only supports DistilBERT models') self.model = CausalBert.from_pretrained( model_name, @@ -224,7 +224,7 @@ def train(self, texts, confounds, treatments, outcomes, losses = [] self.model.train() for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)): - if CUDA: + if CUDA: batch = (x.cuda() for x in batch) W_ids, W_len, W_mask, C, T, Y = batch # while True: @@ -253,7 +253,7 @@ def inference(self, texts, confounds, outcome=None): Q1s = [] Ys = [] for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)): - if CUDA: + if CUDA: batch = (x.cuda() for x in batch) W_ids, W_len, W_mask, C, T, Y = batch g, Q0, Q1, _, _, _ = self.model(W_ids, W_len, W_mask, C, T, use_mlm=False) diff --git a/causalnlp/core/causalinference.py b/causalnlp/core/causalinference.py index d294e26..d51a74e 100644 --- a/causalnlp/core/causalinference.py +++ b/causalnlp/core/causalinference.py @@ -1,8 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/00a_core.causalinference.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/00a_core.causalinference.ipynb. -__all__ = ['CausalInferenceModel', 'metalearner_cls_dict', 'metalearner_reg_dict'] +# %% auto 0 +__all__ = ['metalearner_cls_dict', 'metalearner_reg_dict', 'CausalInferenceModel'] -# Cell +# %% ../../nbs/00a_core.causalinference.ipynb 4 import pandas as pd pd.set_option('display.max_columns', 500) import time @@ -35,30 +36,30 @@ class CausalInferenceModel: """Infers causality from the data contained in `df` using a metalearner. 
- - + + Usage: ```python - >>> cm = CausalInferenceModel(df, - treatment_col='Is_Male?', + >>> cm = CausalInferenceModel(df, + treatment_col='Is_Male?', outcome_col='Post_Shared?', text_col='Post_Text', ignore_cols=['id', 'email']) cm.fit() ``` - + **Parameters:** - + * **df** : pandas.DataFrame containing dataset * **method** : metalearner model to use. One of {'t-learner', 's-learner', 'x-learner', 'r-learner'} (Default: 't-learner') * **metalearner_type** : Alias of `method` for backwards compatibility. Overrides `method` if not None. * **treatment_col** : treatment variable; column should contain binary values: 1 for treated, 0 for untreated. * **outcome_col** : outcome variable; column should contain the categorical or numeric outcome values - * **text_col** : (optional) text column containing the strings (e.g., articles, reviews, emails). + * **text_col** : (optional) text column containing the strings (e.g., articles, reviews, emails). * **ignore_cols** : columns to ignore in the analysis * **include_cols** : columns to include as covariates (e.g., possible confounders) * **treatment_effect_col** : name of column to hold causal effect estimations. Does not need to exist. Created by CausalNLP. - * **learner** : an instance of a custom learner. If None, Log/Lin Regression is used for S-Learner + * **learner** : an instance of a custom learner. If None, Log/Lin Regression is used for S-Learner and a default LightGBM model will be used for all other metalearner types. # Example learner = LGBMClassifier(num_leaves=1000) @@ -69,12 +70,12 @@ class CausalInferenceModel: * **stop_words** : stop words used for text processing (from sklearn) * **verbose** : If 1, print informational messages. If 0, suppress. 
""" - def __init__(self, - df, + def __init__(self, + df, method='t-learner', metalearner_type=None, # alias for method - treatment_col='treatment', - outcome_col='outcome', + treatment_col='treatment', + outcome_col='outcome', text_col=None, ignore_cols=[], include_cols=[], @@ -90,8 +91,8 @@ def __init__(self, constructor """ # for backwards compatibility - if metalearner_type is not None: - if method != 't-learner': + if metalearner_type is not None: + if method != 't-learner': warnings.warn(f'metalearner_type and method are mutually exclusive. '+\ f'Used {metalearner_type} as method.') method = metalearner_type @@ -104,13 +105,13 @@ def __init__(self, self.v = verbose self.df = df.copy() self.ps = None # computed by _create_metalearner, if necessary - - + + # these are auto-populated by preprocess method self.x = None self.y = None self.treatment = None - + # preprocess self.pp = DataframePreprocessor(treatment_col = treatment_col, outcome_col = outcome_col, @@ -130,9 +131,9 @@ def __init__(self, supplied_learner=learner, supplied_effect_learner=effect_learner) + - - def _create_metalearner(self, method='t-learner', + def _create_metalearner(self, method='t-learner', supplied_learner=None, supplied_effect_learner=None): ## use LRSRegressor for s-learner regression as default instead of tree-based model #if method =='s-learner' and supplied_learner is None: return LRSRegressor() @@ -147,7 +148,7 @@ def _create_metalearner(self, method='t-learner', learner = default_learner if supplied_learner is None else supplied_learner effect_learner = default_effect_learner if supplied_effect_learner is None else\ supplied_effect_learner - + # set metalearner metalearner_class = metalearner_cls_dict[method] if self.pp.is_classification \ else metalearner_reg_dict[method] @@ -159,15 +160,15 @@ def _create_metalearner(self, method='t-learner', treatment_outcome_learner=deepcopy(learner), control_effect_learner=deepcopy(effect_learner), 
treatment_effect_learner=deepcopy(effect_learner), - control_name=0) + control_name=0) else: model = metalearner_class(outcome_learner=deepcopy(learner), effect_learner=deepcopy(effect_learner), - control_name=0) + control_name=0) return model - - + + def fit(self, p=None): """ Fits a causal inference model and estimates outcome @@ -183,7 +184,7 @@ def fit(self, p=None): self.df[self.te] = preds print("time to fit causal inference model: ",-start_time + time.time()," sec") return self - + def predict(self, df, p=None): """ Estimates the treatment effect for each observation in `df`. @@ -192,10 +193,10 @@ def predict(self, df, p=None): For X-Learner and R-Learner, propensity scores will be computed using default propensity model unless `p` is not None. Parameter `p` is not used for other methods. - """ + """ _, x, _, _ = self.pp.preprocess(df, training=False) return self._predict(x, p=p) - + def _predict(self, x, p=None): """ @@ -206,7 +207,7 @@ def _predict(self, x, p=None): return self.model.predict(x.values, p=p) else: return self.model.predict(x, p=p) - + def estimate_ate(self, bool_mask=None): """ Estimates the treatment effect for each observation in @@ -216,7 +217,7 @@ def estimate_ate(self, bool_mask=None): a = df[self.te].values mean = np.mean(a) return {'ate' : mean} - + def interpret(self, plot=False, method='feature_importance'): """ @@ -240,17 +241,17 @@ def interpret(self, plot=False, method='feature_importance'): else: raise ValueError('Unknown method: %s' % method) return fn(X=self.x, tau=tau, features = feature_names) - + def compute_propensity_scores(self, x_pred=None): """ Computes and returns propensity scores for `CausalInferenceModel.treatment` in addition to the Propensity model. """ - from ..meta import propensity + from causalnlp.meta import propensity return propensity.compute_propensity_score(self.x, self.treatment, X_pred=x_pred) - - + + def _balance(self, caliper = None, n_fold=3, overwrite=False): """ Balances dataset to minimize bias. 
Currently uses propensity score matching. @@ -304,16 +305,16 @@ def _balance(self, caliper = None, n_fold=3, overwrite=False): 'with original dataset.') else: print('\nBalanced data is available as variables: x_matched, y_matched, treatment_matched, df_matched') - return - + return + def _predict_shap(self, x): return self._predict(x) - + def explain(self, df, row_index=None, row_num=0, background_size=50, nsamples=500): """ Explain the treatment effect estimate of a single observation using SHAP. - - + + **Parameters:** - **df** (pd.DataFrame): a pd.DataFrame of test data is same format as original training data DataFrame - **row_num** (int): raw row number in DataFrame to explain (default:0, the first row) @@ -336,7 +337,7 @@ def explain(self, df, row_index=None, row_num=0, background_size=50, nsamples=50 # select row df_display_row = df_display.iloc[[row_num]] r_key = 'row_num' - r_val = row_num + r_val = row_num # shap explainer = shap.KernelExplainer(f, self.x.iloc[:background_size,:]) @@ -349,7 +350,7 @@ def explain(self, df, row_index=None, row_num=0, background_size=50, nsamples=50 shap_values = shap_values[0] plt.show(shap.force_plot(expected_value, shap_values, df_display_row, matplotlib=True)) - + def get_required_columns(self): """ Returns required columns that must exist in any DataFrame supplied to `CausalInferenceModel.predict`. @@ -370,24 +371,24 @@ def tune_and_use_default_learner(self, split_pct=0.2, random_state=314, scoring= is used for regresssion. 
""" from sklearn.model_selection import train_test_split - X_train, X_test, y_train, y_test = train_test_split(self.x.values, self.y.values, - test_size=split_pct, + X_train, X_test, y_train, y_test = train_test_split(self.x.values, self.y.values, + test_size=split_pct, random_state=random_state) - - fit_params={"early_stopping_rounds":30, - "eval_metric" : 'auc' if self.pp.is_classification else 'rmse', + + fit_params={"early_stopping_rounds":30, + "eval_metric" : 'auc' if self.pp.is_classification else 'rmse', "eval_set" : [(X_test,y_test)], 'eval_names': ['valid'], 'verbose': 100, - 'categorical_feature': 'auto'} - + 'categorical_feature': 'auto'} + from scipy.stats import randint as sp_randint from scipy.stats import uniform as sp_uniform - param_test ={'num_leaves': sp_randint(6, 750), - 'min_child_samples': sp_randint(20, 500), + param_test ={'num_leaves': sp_randint(6, 750), + 'min_child_samples': sp_randint(20, 500), 'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4], - 'subsample': sp_uniform(loc=0.2, scale=0.8), + 'subsample': sp_uniform(loc=0.2, scale=0.8), 'colsample_bytree': sp_uniform(loc=0.4, scale=0.6), 'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100], 'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]} @@ -398,11 +399,11 @@ def tune_and_use_default_learner(self, split_pct=0.2, random_state=314, scoring= else: learner_type = LGBMRegressor scoring = 'neg_mean_squared_error' if scoring is None else scoring - clf = learner_type(max_depth=-1, random_state=random_state, silent=True, + clf = learner_type(max_depth=-1, random_state=random_state, silent=True, metric='None', n_jobs=4, n_estimators=5000) from sklearn.model_selection import RandomizedSearchCV, GridSearchCV gs = RandomizedSearchCV( - estimator=clf, param_distributions=param_test, + estimator=clf, param_distributions=param_test, n_iter=n_HP_points_to_test, scoring=scoring, cv=3, @@ -415,7 +416,7 @@ def tune_and_use_default_learner(self, split_pct=0.2, random_state=314, scoring= 
best_params = gs.best_params_ self.learner = learner_type(**best_params) return best_params - + def evaluate_robustness(self, sample_size=0.8): """ Evaluates robustness on four sensitivity measures (see CausalML package for details on these methods): @@ -424,25 +425,25 @@ def evaluate_robustness(self, sample_size=0.8): - **Random Replacement**: ATE should not change. - **Subset Data**: ATE should not change. """ - from ..meta.sensitivity import Sensitivity + from causalnlp.meta.sensitivity import Sensitivity data_df = self.x.copy() t_col = 'CausalNLP_t' y_col = 'CausalNLP_y' data_df[t_col] = self.treatment data_df[y_col] = self.y - sens_x = Sensitivity(df=data_df, + sens_x = Sensitivity(df=data_df, inference_features=self.x.columns.values, p_col=None, - treatment_col=t_col, outcome_col=y_col, + treatment_col=t_col, outcome_col=y_col, learner=self.model) df = sens_x.sensitivity_analysis(methods=['Placebo Treatment', 'Random Cause', 'Subset Data', 'Random Replace', ],sample_size=sample_size) - df['Distance from Desired (should be near 0)'] = np.where(df['Method']=='Placebo Treatment', - df['New ATE']-0.0, + df['Distance from Desired (should be near 0)'] = np.where(df['Method']=='Placebo Treatment', + df['New ATE']-0.0, df['New ATE']-df['ATE']) - + #df['Method'] = np.where(df['Method']=='Random Cause', 'Random Add', df['Method']) - return df \ No newline at end of file + return df diff --git a/causalnlp/key_driver_analysis.py b/causalnlp/key_driver_analysis.py index f890af9..4249d20 100644 --- a/causalnlp/key_driver_analysis.py +++ b/causalnlp/key_driver_analysis.py @@ -1,8 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/03_key_driver_analysis.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/03_key_driver_analysis.ipynb. 
+# %% auto 0 __all__ = ['KeyDriverAnalysis'] -# Cell +# %% ../nbs/03_key_driver_analysis.ipynb 5 import numpy as np import pandas as pd pd.set_option('display.max_columns', 500) @@ -27,9 +28,9 @@ def __init__(self, df, outcome_col='outcome', text_col=None, include_cols=[], ig """ self.v = verbose self.pp = None # set with call to _preprocess - self.df, self.x, self.y = self._preprocess(df, outcome_col=outcome_col, text_col=text_col, + self.df, self.x, self.y = self._preprocess(df, outcome_col=outcome_col, text_col=text_col, include_cols=include_cols, ignore_cols=ignore_cols) - + def _preprocess(self, df, outcome_col='outcome', text_col=None, include_cols=[], ignore_cols=[]): """ @@ -38,7 +39,7 @@ def _preprocess(self, df, outcome_col='outcome', text_col=None, include_cols=[], temp_treatment = 'CausalNLP_temp_treatment' df = df.copy() df[temp_treatment] = [0] * df.shape[0] - + # preprocess self.pp = DataframePreprocessor(treatment_col = temp_treatment, outcome_col = outcome_col, @@ -71,17 +72,17 @@ def correlations(self, outcome_only=True): else: return corrALL - - - def importances(self, plot=True, split_pct=0.2, + + + def importances(self, plot=True, split_pct=0.2, use_shap=False, shap_background_size=50, rf_model=None, n_estimators=100, n_jobs=-1, random_state=42): """ Identifies important predictors using a RandomForest model. 
""" - - X_train, X_test, y_train, y_test = train_test_split(self.x.values, self.y.values, - test_size=split_pct, + + X_train, X_test, y_train, y_test = train_test_split(self.x.values, self.y.values, + test_size=split_pct, random_state=random_state) rf_type = RandomForestClassifier if self.pp.is_classification else RandomForestRegressor rf = rf_type(n_estimators = n_estimators, @@ -92,7 +93,7 @@ def importances(self, plot=True, split_pct=0.2, rf.fit(X_train, y_train) if self.v: print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format( - rf.score(X_train, y_train), + rf.score(X_train, y_train), rf.oob_score_, rf.score(X_test, y_test))) driverNames = self.x.columns.values @@ -120,4 +121,4 @@ def importances(self, plot=True, split_pct=0.2, if plot: feat_importances = pd.Series(rf.feature_importances_, index=driverNames) feat_importances.nlargest(20).plot(kind='barh') - return df_results \ No newline at end of file + return df_results diff --git a/causalnlp/meta/base.py b/causalnlp/meta/base.py index c9aa602..34a0adb 100644 --- a/causalnlp/meta/base.py +++ b/causalnlp/meta/base.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05a_meta.base.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05a_meta.base.ipynb. -__all__ = ['BaseLearner', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'BaseLearner'] +# %% ../../nbs/05a_meta.base.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -256,4 +256,4 @@ def plot_shap_dependence(self, treatment_group, feature_idx, X, tau, model_tau_f feature_idx=feature_idx, shap_dict=shap_dict, interaction_idx=interaction_idx, - **kwargs) \ No newline at end of file + **kwargs) diff --git a/causalnlp/meta/explainer.py b/causalnlp/meta/explainer.py index 95a3e2c..742ea6f 100644 --- a/causalnlp/meta/explainer.py +++ b/causalnlp/meta/explainer.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! 
File to edit: nbs/05g_meta.explainer.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05g_meta.explainer.ipynb. -__all__ = ['Explainer', 'VALID_METHODS'] - -# Cell +# %% auto 0 +__all__ = ['VALID_METHODS', 'Explainer'] +# %% ../../nbs/05g_meta.explainer.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -182,7 +182,7 @@ def get_shap_values(self): except ImportError: raise ImportError('Please install shap (conda is recommended): '+\ 'conda install shap --channel conda-forge') - + shap_dict = {} for group, mod in self.models_tau.items(): explainer = shap.TreeExplainer(mod) @@ -218,7 +218,7 @@ def plot_shap_values(self, shap_dict=None): except ImportError: raise ImportError('Please install shap (conda is recommended): '+\ 'conda install shap --channel conda-forge') - + if shap_dict is None: shap_dict = self.get_shap_values() @@ -256,4 +256,4 @@ def plot_shap_dependence(self, treatment_group, feature_idx, shap_dict=None, int shap_values = shap_dict[treatment_group] shap.dependence_plot(feature_idx, shap_values, self.X, interaction_index=interaction_idx, - feature_names=self.features, **kwargs) \ No newline at end of file + feature_names=self.features, **kwargs) diff --git a/causalnlp/meta/propensity.py b/causalnlp/meta/propensity.py index 81c2c6c..b828684 100644 --- a/causalnlp/meta/propensity.py +++ b/causalnlp/meta/propensity.py @@ -1,10 +1,10 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05h_meta.propensity.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05h_meta.propensity.ipynb. 
-__all__ = ['PropensityModel', 'LogisticRegressionPropensityModel', 'SimplePropensityModel', 'ElasticNetPropensityModel', - 'GradientBoostedPropensityModel', 'calibrate', 'compute_propensity_score', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'PropensityModel', 'LogisticRegressionPropensityModel', 'SimplePropensityModel', 'ElasticNetPropensityModel', + 'GradientBoostedPropensityModel', 'calibrate', 'compute_propensity_score'] +# %% ../../nbs/05h_meta.propensity.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -117,7 +117,7 @@ def _model(self): kwargs.update(self.model_kwargs) return LogisticRegressionCV(**kwargs) - + class SimplePropensityModel(PropensityModel): """ Propensity regression model based on the LogisticRegression algorithm. @@ -271,4 +271,4 @@ def compute_propensity_score(X, treatment, p_model=None, X_pred=None, treatment_ p = np.where(p < 0 + eps, 0 + eps*1.001, p) p = np.where(p > 1 - eps, 1 - eps*1.001, p) - return p, p_model \ No newline at end of file + return p, p_model diff --git a/causalnlp/meta/rlearner.py b/causalnlp/meta/rlearner.py index 8b012e7..7d41bdc 100644 --- a/causalnlp/meta/rlearner.py +++ b/causalnlp/meta/rlearner.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05e_meta.rlearner.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05e_meta.rlearner.ipynb. -__all__ = ['BaseRLearner', 'BaseRRegressor', 'BaseRClassifier', 'XGBRRegressor', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'BaseRLearner', 'BaseRRegressor', 'BaseRClassifier', 'XGBRRegressor'] +# %% ../../nbs/05e_meta.rlearner.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. 
@@ -535,4 +535,4 @@ def fit(self, X, treatment, y, p=None, verbose=True): eval_metric=self.effect_learner_eval_metric) self.vars_c[group] = (y_filt[w == 0] - yhat_filt[w == 0]).var() - self.vars_t[group] = (y_filt[w == 1] - yhat_filt[w == 1]).var() \ No newline at end of file + self.vars_t[group] = (y_filt[w == 1] - yhat_filt[w == 1]).var() diff --git a/causalnlp/meta/sensitivity.py b/causalnlp/meta/sensitivity.py index 5195af0..0cfe099 100644 --- a/causalnlp/meta/sensitivity.py +++ b/causalnlp/meta/sensitivity.py @@ -1,11 +1,10 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05i_meta.sensitivity.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05i_meta.sensitivity.ipynb. -__all__ = ['one_sided', 'alignment', 'one_sided_att', 'alignment_att', 'Sensitivity', 'SensitivityPlaceboTreatment', - 'SensitivityRandomCause', 'SensitivityRandomReplace', 'SensitivitySubsetData', 'SensitivitySelectionBias', - 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'one_sided', 'alignment', 'one_sided_att', 'alignment_att', 'Sensitivity', 'SensitivityPlaceboTreatment', + 'SensitivityRandomCause', 'SensitivityRandomReplace', 'SensitivitySubsetData', 'SensitivitySelectionBias'] +# %% ../../nbs/05i_meta.sensitivity.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -158,7 +157,7 @@ def get_ate_ci(self, X, p, treatment, y): """ learner = self.learner - from .tlearner import BaseTLearner + from causalnlp.meta.tlearner import BaseTLearner if isinstance(learner, BaseTLearner): ate, ate_lower, ate_upper = learner.estimate_ate(X=X, treatment=treatment, y=y) else: diff --git a/causalnlp/meta/slearner.py b/causalnlp/meta/slearner.py index 317b1ad..979860e 100644 --- a/causalnlp/meta/slearner.py +++ b/causalnlp/meta/slearner.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05c_meta.slearner.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! 
File to edit: ../../nbs/05c_meta.slearner.ipynb. -__all__ = ['BaseSLearner', 'BaseSRegressor', 'BaseSClassifier', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'BaseSLearner', 'BaseSRegressor', 'BaseSClassifier'] +# %% ../../nbs/05c_meta.slearner.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. diff --git a/causalnlp/meta/tlearner.py b/causalnlp/meta/tlearner.py index ea6286a..576c191 100644 --- a/causalnlp/meta/tlearner.py +++ b/causalnlp/meta/tlearner.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05b_meta.tlearner.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05b_meta.tlearner.ipynb. -__all__ = ['BaseTLearner', 'BaseTRegressor', 'BaseTClassifier', 'XGBTRegressor', 'MLPTRegressor', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'BaseTLearner', 'BaseTRegressor', 'BaseTClassifier', 'XGBTRegressor', 'MLPTRegressor'] +# %% ../../nbs/05b_meta.tlearner.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -384,4 +384,4 @@ def __init__(self, ate_alpha=.05, control_name=0, *args, **kwargs): """Initialize a T-learner with two MLP models.""" super().__init__(learner=MLPRegressor(*args, **kwargs), ate_alpha=ate_alpha, - control_name=control_name) \ No newline at end of file + control_name=control_name) diff --git a/causalnlp/meta/utils.py b/causalnlp/meta/utils.py index 548a223..6fb4059 100644 --- a/causalnlp/meta/utils.py +++ b/causalnlp/meta/utils.py @@ -1,12 +1,12 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/05f_meta.utils.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05f_meta.utils.ipynb. 
-__all__ = ['convert_pd_to_np', 'check_treatment_vector', 'check_p_conditions', 'check_explain_conditions', - 'clean_xgboost_objective', 'get_xgboost_objective_metric', 'EPS', 'ape', 'mape', 'smape', 'rmse', 'gini', - 'regression_metrics', 'logger', 'logloss', 'classification_metrics', 'logger', 'smd', 'create_table_one', - 'NearestNeighborMatch', 'MatchOptimizer', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['EPS', 'logger', 'convert_pd_to_np', 'check_treatment_vector', 'check_p_conditions', 'check_explain_conditions', + 'clean_xgboost_objective', 'get_xgboost_objective_metric', 'ape', 'mape', 'smape', 'rmse', 'gini', + 'regression_metrics', 'logloss', 'classification_metrics', 'smd', 'create_table_one', 'NearestNeighborMatch', + 'MatchOptimizer'] +# %% ../../nbs/05f_meta.utils.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -126,11 +126,10 @@ def clean_dict_keys(orig): 'Effect learner objective must be one of: ' + ", ".join(metric_mapping) return objective, metric_mapping[objective] -# Cell +# %% ../../nbs/05f_meta.utils.ipynb 5 EPS = 1e-15 -# Cell - +# %% ../../nbs/05f_meta.utils.ipynb 6 import logging import numpy as np from sklearn.metrics import mean_squared_error as mse @@ -255,7 +254,7 @@ def regression_metrics(y, p, w=None, metrics={'RMSE': rmse, 'sMAPE': smape, 'Gin else: logger.info('{:>8s}: {:10.4f}'.format(name, func(y, p))) -# Cell +# %% ../../nbs/05f_meta.utils.ipynb 7 import logging from sklearn.metrics import log_loss, roc_auc_score @@ -288,9 +287,7 @@ def classification_metrics(y, p, w=None, metrics={'AUC': roc_auc_score, 'Log Los """ regression_metrics(y=y, p=p, w=w, metrics=metrics) - -# Cell - +# %% ../../nbs/05f_meta.utils.ipynb 8 import argparse import logging import sys diff --git a/causalnlp/meta/xlearner.py b/causalnlp/meta/xlearner.py index 35002fb..c8ab6a6 100644 --- a/causalnlp/meta/xlearner.py +++ b/causalnlp/meta/xlearner.py @@ -1,9 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! 
File to edit: nbs/05d_meta.xlearner.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/05d_meta.xlearner.ipynb. -__all__ = ['BaseXLearner', 'BaseXRegressor', 'BaseXClassifier', 'logger'] - -# Cell +# %% auto 0 +__all__ = ['logger', 'BaseXLearner', 'BaseXRegressor', 'BaseXClassifier'] +# %% ../../nbs/05d_meta.xlearner.ipynb 4 # REFERENCE: https://github.com/uber/causalml # Copyright 2019 Uber Technology, Inc. @@ -559,4 +559,4 @@ def predict(self, X, treatment=None, y=None, p=None, return_components=False, if not return_components: return te else: - return te, dhat_cs, dhat_ts \ No newline at end of file + return te, dhat_cs, dhat_ts diff --git a/causalnlp/preprocessing.py b/causalnlp/preprocessing.py index c9fdbda..572ae1c 100644 --- a/causalnlp/preprocessing.py +++ b/causalnlp/preprocessing.py @@ -1,8 +1,9 @@ -# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/04_preprocessing.ipynb (unless otherwise specified). +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/04_preprocessing.ipynb. +# %% auto 0 __all__ = ['DataframePreprocessor'] -# Cell +# %% ../nbs/04_preprocessing.ipynb 4 import numpy as np import pandas as pd pd.set_option('display.max_columns', 500) @@ -13,9 +14,9 @@ class DataframePreprocessor: """ Preproceses a pandas DataFrame for causal inference """ - def __init__(self, - treatment_col='treatment', - outcome_col='outcome', + def __init__(self, + treatment_col='treatment', + outcome_col='outcome', text_col=None, include_cols=[], ignore_cols=[], @@ -30,7 +31,7 @@ def __init__(self, self.ignore_cols = ignore_cols self.v = verbose - + # these variables set by preprocess self.feature_names = None self.feature_names_one_hot = None @@ -40,7 +41,7 @@ def __init__(self, self.is_classification = None - def preprocess(self, df, + def preprocess(self, df, training=False, min_df=0.05, max_df=0.5, @@ -61,8 +62,8 @@ def preprocess(self, df, raise ValueError('ignore_cols and include_cols are mutually exclusive. 
Please choose one.') if training and self.include_cols: self.ignore_cols = [c for c in df.columns.values if c not in self.include_cols +\ - [self.treatment_col, - self.outcome_col, + [self.treatment_col, + self.outcome_col, self.text_col]] if self.text_col is not None and self.text_col not in df: raise ValueError(f'You specified text_col="{self.text_col}", but {self.text_col} is not a column in df.') @@ -70,13 +71,13 @@ def preprocess(self, df, raise ValueError(f'ignore_cols contains the treatment column ({self.treatment_col})') if self.outcome_col in self.ignore_cols: raise ValueError(f'ignore_cols contains the outcome column ({self.outcome_col})') - + start_time = time.time() - + # step 1: check/clean dataframe if not isinstance(df, pd.DataFrame): raise ValueError('df must be a pandas DataFrame') - df = df.rename(columns=lambda x: x.strip()) # strip headers + df = df.rename(columns=lambda x: x.strip()) # strip headers # check and re-order test DataFrame if not training: test_feats = [col.strip() for col in df.columns.values if col.strip() in self.feature_names] @@ -85,27 +86,27 @@ def preprocess(self, df, if self.treatment_col not in df.columns: raise ValueError(f'Column {self.treatment_col} is missing from df.') if self.text_col is not None and self.text_col not in df.columns.values: - raise ValueError(f'Colummn {self.text_col} is missing from df') + raise ValueError(f'Colummn {self.text_col} is missing from df') df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) # strip data df, _ = self._preprocess_column(df, self.treatment_col, is_treatment=True) if training: - df, self.is_classification = self._preprocess_column(df, + df, self.is_classification = self._preprocess_column(df, self.outcome_col, is_treatment=False) self.feature_names = [c for c in df.columns.values \ - if c not in [self.treatment_col, + if c not in [self.treatment_col, self.outcome_col, self.text_col]+self.ignore_cols] for c in self.feature_names: self.feature_types[c] = 
self._check_type(df, c)['dtype'] X = df[self.feature_names].copy() Y = df[self.outcome_col].copy() if training else None - T = df[self.treatment_col].copy() + T = df[self.treatment_col].copy() # step 2: fill empty values on x for c in self.feature_names: - dtype = self.feature_types[c] + dtype = self.feature_types[c] if dtype == 'string': X[c] = X[c].fillna(na_cat_value) if dtype == 'numeric': X[c] = X[c].fillna(na_cont_value) - + # step 3: one-hot encode categorial features for c in self.feature_names: if c == self.text_col: continue @@ -119,26 +120,26 @@ def preprocess(self, df, err_msg = f'Column "{c}" looks like it contains free-form text or ' +\ f'or unique values. Please either set text_col="{c}" or add it to "ignore_cols" list.' raise ValueError(err_msg) - + if training: self.cat_dict[c] = sorted(X[c].unique()) catcol = X[c] else: #REF: https://stackoverflow.com/a/37451867/13550699 catcol = X[c].astype(pd.CategoricalDtype(categories=self.cat_dict[c])) - X = X.merge(pd.get_dummies(catcol, prefix = c, - drop_first=False), + X = X.merge(pd.get_dummies(catcol, prefix = c, + drop_first=False), left_index=True, right_index=True) - + del X[c] self.feature_names_one_hot = X.columns - - + + # step 4: for text-based confounder, use extracted vocabulary as features if self.text_col is not None: from sklearn.feature_extraction.text import TfidfVectorizer if training: - self.tv = TfidfVectorizer(min_df=min_df, max_df=max_df, + self.tv = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range, stop_words=stop_words) v_features = self.tv.fit_transform(df[self.text_col]) else: @@ -155,8 +156,8 @@ def preprocess(self, df, if self.v and self.text_col: print('text covariate: %s' % (self.text_col)) if self.v: print("preprocess time: ", -start_time + time.time()," sec") return (df, X, Y, T) - - + + def _preprocess_column(self, df, col, is_treatment=True): """ Preprocess treatment and outcome columns. 
@@ -171,10 +172,10 @@ def _preprocess_column(self, df, col, is_treatment=True): d = self._check_type(df, col) typ = d['dtype'] num = d['nunique'] - + # process as treatment if is_treatment: - if typ == 'numeric' or (typ == 'string' and num != 2): + if typ == 'numeric' or (typ == 'string' and num != 2): raise ValueError('Treatment column must contain only two unique values ' +\ 'indicating the treated and control groups.') values = sorted(df[col].unique()) @@ -190,13 +191,13 @@ def _preprocess_column(self, df, col, is_treatment=True): df[col].replace(values, [0,1], inplace=True) if self.v: print('replaced %s in column "%s" with %s' % (values, col, [0,1])) return df, self._check_binary(df, col) - - + + def _check_type(self, df, col): from pandas.api.types import is_string_dtype from pandas.api.types import is_numeric_dtype dtype = None - + tmp_var = df[df[col].notnull()][col] if is_numeric_dtype(tmp_var): dtype = 'numeric' elif is_string_dtype(tmp_var): dtype = 'string' @@ -205,11 +206,11 @@ def _check_type(self, df, col): 'Column %s is neither' % (col)) output = {'dtype' : dtype, 'nunique' : tmp_var.nunique()} return output - + def _check_binary(self, df, col): - return df[col].isin([0,1]).all() + return df[col].isin([0,1]).all() def _get_feature_names(self, df): return [c for c in df.columns.values \ - if c not in [self.treatment_col, self.outcome_col]+self.ignore_cols] \ No newline at end of file + if c not in [self.treatment_col, self.outcome_col]+self.ignore_cols] diff --git a/docs/.gitignore b/docs/.gitignore deleted file mode 100644 index 57510a2..0000000 --- a/docs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -_site/ diff --git a/docs/Gemfile b/docs/Gemfile deleted file mode 100644 index 8a09b13..0000000 --- a/docs/Gemfile +++ /dev/null @@ -1,9 +0,0 @@ -source "https://rubygems.org" - -gem 'github-pages', group: :jekyll_plugins - -# Added at 2019-11-25 10:11:40 -0800 by jhoward: -gem "nokogiri", "< 1.11.1" -gem "jekyll", ">= 3.7" -gem "kramdown", ">= 2.3.1" -gem 
"jekyll-remote-theme" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock deleted file mode 100644 index 39d7f3c..0000000 --- a/docs/Gemfile.lock +++ /dev/null @@ -1,269 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (6.0.3.6) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.7.0) - public_suffix (>= 2.0.2, < 5.0) - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.17.13) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.8) - dnsruby (1.61.5) - simpleidn (~> 0.1) - em-websocket (0.5.2) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - ethon (0.12.0) - ffi (>= 1.3.0) - eventmachine (1.2.7) - execjs (2.7.0) - faraday (1.3.0) - faraday-net_http (~> 1.0) - multipart-post (>= 1.2, < 3) - ruby2_keywords - faraday-net_http (1.0.1) - ffi (1.15.0) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (214) - github-pages-health-check (= 1.17.0) - jekyll (= 3.9.0) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.6.0) - jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.7.1) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.1.1) - jekyll-theme-cayman (= 0.1.1) - jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.2) - jekyll-theme-leap-day (= 0.1.1) - jekyll-theme-merlot (= 0.1.1) - jekyll-theme-midnight (= 0.1.1) - jekyll-theme-minimal (= 0.1.1) - jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.4) - jekyll-theme-slate (= 0.1.1) - 
jekyll-theme-tactile (= 0.1.1) - jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.1) - kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.26.0) - terminal-table (~> 1.4) - github-pages-health-check (1.17.0) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 2.0.2, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.0) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.6.0) - i18n (0.9.5) - concurrent-ruby (~> 1.0) - jekyll (3.9.0) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (~> 0.7) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) - jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll (>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - 
jekyll-seo-tag (2.7.1) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.2) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.4) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.1.1) - jekyll (~> 3.5) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 5.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.1) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.3) - listen (3.5.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - mini_portile2 (2.5.0) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minitest (5.14.4) - multipart-post (2.1.1) - nokogiri (1.11.0) - mini_portile2 (~> 2.5.0) - racc (~> 1.4) - octokit (4.20.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.6) - racc (1.5.2) - rb-fsevent (0.10.4) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.2.5) - rouge 
(3.26.0) - ruby-enum (0.9.0) - i18n - ruby2_keywords (0.0.4) - rubyzip (2.3.0) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) - addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (1.2.9) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.7.7) - unicode-display_width (1.7.0) - zeitwerk (2.4.2) - -PLATFORMS - ruby - -DEPENDENCIES - github-pages - jekyll (>= 3.7) - jekyll-remote-theme - kramdown (>= 2.3.1) - nokogiri (< 1.11.1) - -BUNDLED WITH - 2.1.4 diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index e4169b1..0000000 --- a/docs/_config.yml +++ /dev/null @@ -1,67 +0,0 @@ -repository: amaiya/causalnlp -output: web -topnav_title: causalnlp -#default_badges: {colab: true} -site_title: causalnlp -company_name: Arun S. Maiya -description: CausalNLP -# Set to false to disable KaTeX math -use_math: true -# Add Google analytics id if you have one and want to use it here -google_analytics: -# See http://nbdev.fast.ai/search for help with adding Search -google_search: - -host: 127.0.0.1 -# the preview server used. Leave as is. -port: 4000 -# the port where the preview is rendered. 
- -exclude: - - .idea/ - - .gitignore - - vendor - -exclude: [vendor] - -highlighter: rouge -markdown: kramdown -kramdown: - input: GFM - auto_ids: true - hard_wrap: false - syntax_highlighter: rouge - -collections: - tooltips: - output: false - -defaults: - - - scope: - path: "" - type: "pages" - values: - layout: "page" - comments: true - search: true - sidebar: home_sidebar - topnav: topnav - - - scope: - path: "" - type: "tooltips" - values: - layout: "page" - comments: true - search: true - tooltip: true - -sidebars: -- home_sidebar - -plugins: - - jekyll-remote-theme - -remote_theme: fastai/nbdev-jekyll-theme -baseurl: /causalnlp/ diff --git a/docs/_data/sidebars/home_sidebar.yml b/docs/_data/sidebars/home_sidebar.yml deleted file mode 100644 index aea7779..0000000 --- a/docs/_data/sidebars/home_sidebar.yml +++ /dev/null @@ -1,63 +0,0 @@ - -################################################# -### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ### -################################################# -# Instead edit ../../sidebar.json -entries: -- folders: - - folderitems: - - output: web,pdf - title: Overview - url: / - - output: web,pdf - title: Causal Inference - url: core.causalinference.html - - output: web,pdf - title: CausalBert - url: core.causalbert.html - - output: web,pdf - title: Auto Coder - url: autocoder.html - - output: web,pdf - title: Analyzers - url: analyzers.html - - output: web,pdf - title: Key Driver Analysis - url: key_driver_analysis.html - - output: web,pdf - title: Preprocessing - url: preprocessing.html - - output: web,pdf - title: Base Metalearner - url: meta.base.html - - output: web,pdf - title: T-Learner - url: meta.tlearner.html - - output: web,pdf - title: S-Learner - url: meta.slearner.html - - output: web,pdf - title: X-Learner - url: meta.xlearner.html - - output: web,pdf - title: R-Learner - url: meta.rlearner.html - - output: web,pdf - title: Metalearner Utils - url: meta.utils.html - - output: web,pdf - title: Metalearner 
Explainer - url: meta.explainer.html - - output: web,pdf - title: Metalearner Propensity - url: meta.propensity.html - - output: web,pdf - title: Metalearner Sensitivity - url: meta.sensitivity.html - - output: web,pdf - title: Examples - url: examples.html - output: web - title: causalnlp - output: web - title: Sidebar diff --git a/docs/_data/topnav.yml b/docs/_data/topnav.yml deleted file mode 100644 index 389cd1f..0000000 --- a/docs/_data/topnav.yml +++ /dev/null @@ -1,10 +0,0 @@ -topnav: -- title: Topnav - items: - - title: github - external_url: https://github.com/amaiya/causalnlp/tree/main/ - -#Topnav dropdowns -topnav_dropdowns: -- title: Topnav dropdowns - folders: \ No newline at end of file diff --git a/docs/analyzers.html b/docs/analyzers.html deleted file mode 100644 index 410e882..0000000 --- a/docs/analyzers.html +++ /dev/null @@ -1,773 +0,0 @@ ---- - -title: Analyzers - - -keywords: fastai -sidebar: home_sidebar - -summary: "Text analyzers to help create text-based covariates, treatments, or outcomes for causal analyses." -description: "Text analyzers to help create text-based covariates, treatments, or outcomes for causal analyses." -nb_path: "nbs/02_analyzers.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

list2chunks[source]

list2chunks(a, n)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class ZeroShotClassifier[source]

ZeroShotClassifier(model_name='facebook/bart-large-mnli', device=None)

-
-

Interface to Zero Shot Topic Classifier

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

ZeroShotClassifier.predict[source]

ZeroShotClassifier.predict(docs, labels=[], include_labels=False, multilabel=True, max_length=512, batch_size=8, nli_template='This text is about {}.', topic_strings=[])

-
-

This method performs zero-shot text classification using Natural Language Inference (NLI).

-

Parameters:

-
    -
  • docs(list|str): text of document or list of texts
  • -
  • labels(list): a list of strings representing topics of your choice -
              Example:
    -           labels=['political science', 'sports', 'science']
    -
  • -
  • include_labels(bool): If True, will return topic labels along with topic probabilities
  • -
  • multilabel(bool): If True, labels are considered independent and multiple labels can predicted true for document and be close to 1. -
                If False, scores are normalized such that probabilities sum to 1.
    -
  • -
  • max_length(int): truncate long documents to this many tokens
  • -
  • batch_size(int): batch_size to use. default:8 -
               Increase this value to speed up predictions - especially
    -           if len(topic_strings) is large.
    -
  • -
  • nli_template(str): labels are inserted into this template for use as hypotheses in natural language inference
  • -
  • topic_strings(list): alias for labels parameter for backwards compatibility
  • -
-

Returns:

-

inferred probabilities or list of inferred probabilities if doc is list

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
zsl = ZeroShotClassifier()
-labels=['politics', 'elections', 'sports', 'films', 'television']
-doc = 'I am extremely dissatisfied with the President and will definitely vote in 2020.'
-preds = zsl.predict(doc, labels=labels, include_labels=True)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
preds
-
- -
-
-
- -
-
- -
- - - -
-
[('politics', 0.979189932346344),
- ('elections', 0.9874580502510071),
- ('sports', 0.0005765454261563718),
- ('films', 0.002292441902682185),
- ('television', 0.001054605352692306)]
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
d = dict(preds)
-assert d['politics'] > 0.9
-assert d['elections'] > 0.9
-assert d['sports'] < 0.1
-assert d['films'] < 0.1
-assert d['television'] < 0.1
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class TextEncoder[source]

TextEncoder(model_name='stsb-roberta-large', device=None)

-
-

Tiny wrapper to sentence-transformers

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
te = TextEncoder()
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
e = te.encode('The moon is bright.')
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert e.shape[0] == 1
-assert e.shape[1] == 1024
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class TopicModel[source]

TopicModel(texts=None, n_topics=None, n_features=10000, min_df=5, max_df=0.5, stop_words='english', model_type='lda', lda_max_iter=5, lda_mode='online', token_pattern=None, verbose=1, hyperparam_kwargs=None)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
from sklearn.datasets import fetch_20newsgroups
-
-# we only want to keep the body of the documents!
-remove = ('headers', 'footers', 'quotes')
-
-# fetch train and test data
-newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
-newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
-
-# compile the texts
-texts = newsgroups_train.data +  newsgroups_test.data
-
-# let's also store the newsgroup category associated with each document
-# we can display this information in visualizations
-targets = [target for target in list(newsgroups_train.target) + list(newsgroups_test.target)]
-categories = [newsgroups_train.target_names[target] for target in targets]
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm = TopicModel(texts, n_features=10000)
-
- -
-
-
- -
-
- -
- -
-
n_topics automatically set to 97
-preprocessing texts...
-fitting model...
-iteration: 1 of max_iter: 5
-iteration: 2 of max_iter: 5
-iteration: 3 of max_iter: 5
-iteration: 4 of max_iter: 5
-iteration: 5 of max_iter: 5
-done.
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.print_topics()
-
- -
-
-
- -
-
- -
- -
-
topic 0 | tape adam tim case moved bag quote mass marked zionism
-topic 1 | image jpeg images format programs tiff files jfif save lossless
-topic 2 | alternative movie film static cycles films philips dynamic hou phi
-topic 3 | hell humans poster frank reality kent gerard gant eternal bell
-topic 4 | air phd chz kit cbc ups w-s rus w47 mot
-topic 5 | dog math great figure poster couldn don trying rushdie fatwa
-topic 6 | collaboration nazi fact end expression germany philly world certified moore
-topic 7 | gif points scale postscript mirror plane rendering algorithm polygon rayshade
-topic 8 | fonts font shell converted iii characters slight composite breaks compress
-topic 9 | power station supply options option led light tank plastic wall
-topic 10 | transmission rider bmw driver automatic shift gear japanese stick highway
-topic 11 | tyre ezekiel ruler hernia appeared appointed supreme man land power
-topic 12 | space nasa earth data launch surface solar moon mission planet
-topic 13 | israel jews jewish israeli arab peace war arabs palestinian kuwait
-topic 14 | olvwm xremote animals kinds roughing toolkit close corp glenn imakefile
-topic 15 | medical health disease cancer patients drug treatment drugs aids study
-topic 16 | biden chip gear like information number automatic mode insurance know
-topic 17 | graphics zip amiga shareware formats ftp gif program sgi convert
-topic 18 | brilliant mail did god coming christianity people got ideas reading
-topic 19 | black red white blue green cross wires lines helmet mask
-topic 20 | car engine cars miles clutch new ford rear slip road
-topic 21 | list mailing service model small large lists radar available major
-topic 22 | key encryption chip keys clipper phone security use government privacy
-topic 23 | talking pit nyr stl phi edm mtl wsh hfd cgy
-topic 24 | signal input switch connected circuit audio noise output control voltage
-topic 25 | stuff deleted die posting beware fantastic motives authentic reluctant hope
-topic 26 | adams douglas dc-x garrett ingres tin sdio incremental mcdonnell guide
-topic 27 | men homosexual homosexuality women gay sexual homosexuals male kinsey pop
-topic 28 | usual leo rs-232 martian reading cooperative unmanned somalia decompress visited
-topic 29 | edu university information send new computer research mail internet address
-topic 30 | reserve naval marine ret commission one-way irgun prior closure facilities
-topic 31 | state intelligence militia units army zone georgia sam croats belongs
-topic 32 | says article pain known warning doctor stone bug kidney response
-topic 33 | faq rsa ripem lights yes patent nist management wax cipher
-topic 34 | wolverine comics hulk appearance special liefeld sabretooth incredible hobgoblin x-force
-topic 35 | software ram worth cycles controller available make dram dynamic situation
-topic 36 | religion people religious catalog bobby used driven involved long like
-topic 37 | intel sites experiment ftp does know family good like mrs
-topic 38 | armenian people army russian turkish genocide armenians ottoman turks jews
-topic 39 | theft geo available face couldn cover sony people number shop
-topic 40 | christianity did exists mail matter mind tool status god reading
-topic 41 | propane probe earth orbit orbiter titan cassini space atmosphere gravity
-topic 42 | people government right think rights law make public fbi don
-topic 43 | god people does say believe bible true think evidence religion
-topic 44 | mov phone south key war supply push left just registered
-topic 45 | period goal pts play chicago pittsburgh buffalo shots new blues
-topic 46 | game team games year hockey season players player baseball league
-topic 47 | speed dod student technician just hits right note giant light
-topic 48 | sex marriage relationship family married couple depression pregnancy childhood trademark
-topic 49 | protects rejecting com4 couple decides taking connect unc nearest richer
-topic 50 | president states united american national press april washington america white
-topic 51 | card memory windows board ram bus drivers driver cpu problem
-topic 52 | window application manager display button xterm path widget event resources
-topic 53 | cable win van det bos tor cal nyi chi buf
-topic 54 | americans baltimore rochester cape springfield moncton providence utica binghamton adirondack
-topic 55 | color monitor screen mouse video colors resolution vga colour monitors
-topic 56 | option power ssf flights capability module redesign missions station options
-topic 57 | body father son vitamin diet day cells cell form literature
-topic 58 | max g9v b8f a86 bhj giz bxn biz qax b4q
-topic 59 | bit fast chip ibm faster mode chips scsi-2 speeds quadra
-topic 60 | book books law adl islam islamic iran media bullock muslims
-topic 61 | armenian russian turkish ottoman people army armenians genocide war turks
-topic 62 | oscillator partition tune nun umumiye nezareti mecmuasi muharrerat-i evrak version
-topic 63 | tongues seat est didn raise copied lazy schemes adapter leap
-topic 64 | com object jim app function motorola heterosexual objects pointers encountered
-topic 65 | effective boy projects grow jason ain dump keyboards vastly grants
-topic 66 | armenian people russian armenians turks ottoman army turkish genocide muslim
-topic 67 | mac apple pin ground wire quicktime macs pins connector simms
-topic 68 | bastard turning likes hooks notions turks cited proud pointers chuck
-topic 69 | bought dealer cost channel replaced face sony stereo warranty tube
-topic 70 | myers food reaction msg writes loop eat dee effects taste
-topic 71 | lander contradiction reconcile apparent somebody supplement essential needs produce insulin
-topic 72 | re-boost systems virginia voice unix input ken easily summary developing
-topic 73 | block tests suck shadow dte screws macedonia sunlight fin message
-topic 74 | jesus church christ god lord holy spirit mary shall heaven
-topic 75 | gun number year guns rate insurance police years new firearms
-topic 76 | rule automatically characteristic wider thumb recommendation inline mr2 halfway width
-topic 77 | drive disk hard scsi drives controller floppy ide master transfer
-topic 78 | stephanopoulos water gas oil heat energy hot temperature cold nuclear
-topic 79 | like know does use don just good thanks need want
-topic 80 | starters mlb mov higher signing left accessible argument viola teams
-topic 81 | entry rules info define entries year int printf include contest
-topic 82 | price new sale offer sell condition shipping interested asking prices
-topic 83 | issue germany title magazine german cover race generation origin nazi
-topic 84 | armenian armenians people turkish war said killed children russian turkey
-topic 85 | dos windows software comp library os/2 version microsoft applications code
-topic 86 | probe space launch titan earth cassini orbiter orbit atmosphere mission
-topic 87 | housed throws fills daylight occurring activities adjacent presenting punish occuring
-topic 88 | statement folk raids thor disarmed anatolia polygon inria arrive smehlik
-topic 89 | sound steve pro convert ati ultra fahrenheit orchid hercules blaster
-topic 90 | joke tricky wearing golden trickle seen geneva csh course caesar
-topic 91 | moral objective values morality child defined bank definition wrong different
-topic 92 | files file edu ftp available version server data use sun
-topic 93 | catalog tons seal ordering kawasaki tools fax free ultraviolet packages
-topic 94 | file program error output use section line code command problem
-topic 95 | power ssf module capability option flights redesign missions human station
-topic 96 | just don think know like time did going didn people
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.build(texts)
-
- -
-
-
- -
-
- -
- -
-
done.
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
texts[1]
-
- -
-
-
- -
-
- -
- - - -
-
"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks."
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.doc_topics[1]
-
- -
-
-
- -
-
- -
- - - -
-
array([0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.05935853, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.04939132, 0.00105197,
-       0.00105197, 0.00105197, 0.04181867, 0.00105197, 0.00105197,
-       0.00105197, 0.21681858, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.02146013, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.0458702 , 0.02146013, 0.14892628,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
-       0.00105197, 0.00105197, 0.13724779, 0.00105197, 0.00105197,
-       0.00105197, 0.16612722])
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.topics[ np.argmax(tm.doc_topics[1])]
-
- -
-
-
- -
-
- -
- - - -
-
'card memory windows board ram bus drivers driver cpu problem'
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees '  +
-            'the development and manufacturing of advanced rockets and spacecraft for missions ' +
-            'to and beyond Earth orbit.'])
-
- -
-
-
- -
-
- -
- - - -
-
array([[0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.65009096, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.06185567, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
-        0.00303214, 0.00303214]])
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
tm.topics[ np.argmax(tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees '  +
-            'the development and manufacturing of advanced rockets and spacecraft for missions ' +
-            'to and beyond Earth orbit.']))]
-
- -
-
-
- -
-
- -
- - - -
-
'space nasa earth data launch surface solar moon mission planet'
-
- -
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/autocoder.html b/docs/autocoder.html deleted file mode 100644 index 0d8c661..0000000 --- a/docs/autocoder.html +++ /dev/null @@ -1,2863 +0,0 @@ ---- - -title: Auto Coder - - -keywords: fastai -sidebar: home_sidebar - -summary: "Automatically codes text fields such as open-ended survey questions based on lingustic properties such as topic and sentiment." -description: "Automatically codes text fields such as open-ended survey questions based on lingustic properties such as topic and sentiment." -nb_path: "nbs/01_autocoder.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class Autocoder[source]

Autocoder(verbose=1, device=None)

-
-

Autocodes text fields

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_sentiment[source]

Autocoder.code_sentiment(docs, df, batch_size=8, binarize=False, threshold=0.5)

-
-

Autocodes text for positive or negative sentiment

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Let's prepare a toy dataset:

- -
-
-
- {% raw %} - -
-
- -
-
-
ac = Autocoder()
-reviews = ["I loved this doctor!", "This doctor was absolutely terrible."]
-df = pd.DataFrame({
-    'gender': ['female', 'male'],
-     'review' : reviews,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - -
genderreview
0femaleI loved this doctor!
1maleThis doctor was absolutely terrible.
-
- -
- -
-
- -
- {% endraw %} - -
-
-

After autocoding for sentiment, the dataframe now has extra columns:

- -
-
-
- {% raw %} - -
-
- -
-
-
result_df = ac.code_sentiment(df['review'].values, df)
-result_df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
genderreviewnegativepositive
0femaleI loved this doctor!0.0050340.994966
1maleThis doctor was absolutely terrible.0.9817890.018211
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert result_df[result_df['gender']=='female']['negative'].values[0] < 0.1
-assert result_df[result_df['gender']=='female']['positive'].values[0] > 0.9
-assert result_df[result_df['gender']=='male']['negative'].values[0] > 0.9
-assert result_df[result_df['gender']=='male']['positive'].values[0] < 0.1
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_custom_topics[source]

Autocoder.code_custom_topics(docs, df, labels, batch_size=8, binarize=False, threshold=0.5)

-
-

Autocodes text for user-specified topics. -The label field is the name of the topic as a string (or a list of them.)

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Let's prepare a toy dataset:

- -
-
-
- {% raw %} - -
-
- -
-
-
comments = ["What is your favorite sitcom of all time?", 'I cannot wait to vote!']
-df = pd.DataFrame({
-    'over_18': ['yes', 'no'],
-     'comments' : comments,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - -
over_18comments
0yesWhat is your favorite sitcom of all time?
1noI cannot wait to vote!
-
- -
- -
-
- -
- {% endraw %} - -
-
-

After autocoding, the dataframe has a new column for each custom topic:

- -
-
-
- {% raw %} - -
-
- -
-
-
result_df = ac.code_custom_topics(df['comments'].values, df, labels=['television', 'film', 'politics'])
-result_df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
over_18commentstelevisionfilmpolitics
0yesWhat is your favorite sitcom of all time?0.9813270.0122600.000157
1noI cannot wait to vote!0.0005180.0049430.936988
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert result_df[result_df['over_18']=='yes']['television'].values[0] > 0.9
-assert result_df[result_df['over_18']=='yes']['film'].values[0] < 0.1
-assert result_df[result_df['over_18']=='yes']['politics'].values[0] < 0.1
-assert result_df[result_df['over_18']=='no']['television'].values[0] < 0.1
-assert result_df[result_df['over_18']=='no']['film'].values[0] < 0.1
-assert result_df[result_df['over_18']=='no']['politics'].values[0] > 0.9
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_emotion[source]

Autocoder.code_emotion(docs, df, batch_size=8, binarize=False, threshold=0.5)

-
-

Autocodes text for emotion

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
comments = ["I'm nervous about tomorrow.", 'I got a promotion at work!',
-            "My best friend was in a car accident.", "I hate it when I'm cut off in traffic."]
-df = pd.DataFrame({
-    'over_18': ['yes', 'no', 'yes', 'yes'],
-     'comments' : comments,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
over_18comments
0yesI'm nervous about tomorrow.
1noI got a promotion at work!
2yesMy best friend was in a car accident.
3yesI hate it when I'm cut off in traffic.
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
result_df = ac.code_emotion(df['comments'].values, df, binarize=True)
-result_df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
over_18commentsjoyangerfearsadness
0yesI'm nervous about tomorrow.0010
1noI got a promotion at work!1000
2yesMy best friend was in a car accident.0001
3yesI hate it when I'm cut off in traffic.0100
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert result_df.iloc[0]['fear'] == 1
-assert result_df.iloc[1]['joy'] == 1
-assert result_df.iloc[2]['sadness'] == 1
-assert result_df.iloc[3]['anger'] == 1
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_transformer[source]

Autocoder.code_transformer(docs, df, batch_size=32, model_name='stsb-roberta-large', show_progress_bar=False)

-
-

Encode texts as semantically meaningful vectors using a Transformer model

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
reviews = ["I loved this doctor!", "This doctor was absolutely terrible."]
-df = pd.DataFrame({
-    'gender': ['female', 'male'],
-     'review' : reviews,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - -
genderreview
0femaleI loved this doctor!
1maleThis doctor was absolutely terrible.
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df = ac.code_transformer(df.review.values, df)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
genderreviewe_0000e_0001e_0002e_0003e_0004e_0005e_0006e_0007e_0008e_0009e_0010e_0011e_0012e_0013e_0014e_0015e_0016e_0017e_0018e_0019e_0020e_0021e_0022e_0023e_0024e_0025e_0026e_0027e_0028e_0029e_0030e_0031e_0032e_0033e_0034e_0035e_0036e_0037e_0038e_0039e_0040e_0041e_0042e_0043e_0044e_0045e_0046e_0047e_0048e_0049e_0050e_0051e_0052e_0053e_0054e_0055e_0056e_0057e_0058e_0059e_0060e_0061e_0062e_0063e_0064e_0065e_0066e_0067e_0068e_0069e_0070e_0071e_0072e_0073e_0074e_0075e_0076e_0077e_0078e_0079e_0080e_0081e_0082e_0083e_0084e_0085e_0086e_0087e_0088e_0089e_0090e_0091e_0092e_0093e_0094e_0095e_0096e_0097e_0098e_0099e_0100e_0101e_0102e_0103e_0104e_0105e_0106e_0107e_0108e_0109e_0110e_0111e_0112e_0113e_0114e_0115e_0116e_0117e_0118e_0119e_0120e_0121e_0122e_0123e_0124e_0125e_0126e_0127e_0128e_0129e_0130e_0131e_0132e_0133e_0134e_0135e_0136e_0137e_0138e_0139e_0140e_0141e_0142e_0143e_0144e_0145e_0146e_0147e_0148e_0149e_0150e_0151e_0152e_0153e_0154e_0155e_0156e_0157e_0158e_0159e_0160e_0161e_0162e_0163e_0164e_0165e_0166e_0167e_0168e_0169e_0170e_0171e_0172e_0173e_0174e_0175e_0176e_0177e_0178e_0179e_0180e_0181e_0182e_0183e_0184e_0185e_0186e_0187e_0188e_0189e_0190e_0191e_0192e_0193e_0194e_0195e_0196e_0197e_0198e_0199e_0200e_0201e_0202e_0203e_0204e_0205e_0206e_0207e_0208e_0209e_0210e_0211e_0212e_0213e_0214e_0215e_0216e_0217e_0218e_0219e_0220e_0221e_0222e_0223e_0224e_0225e_0226e_0227e_0228e_0229e_0230e_0231e_0232e_0233e_0234e_0235e_0236e_0237e_0238e_0239e_0240e_0241e_0242e_0243e_0244e_0245e_0246e_0247...e_0774e_0775e_0776e_0777e_0778e_0779e_0780e_0781e_0782e_0783e_0784e_0785e_0786e_0787e_0788e_0789e_0790e_0791e_0792e_0793e_0794e_0795e_0796e_0797e_0798e_0799e_0800e_0801e_0802e_0803e_0804e_0805e_0806e_0807e_0808e_0809e_0810e_0811e_0812e_0813e_0814e_0815e_0816e_0817e_0818e_0819e_0820e_0821e_0822e_0823e_0824e_0825e_0826e_0827e_0828e_0829e_0830e_0831e_0832e_0833e_0834e_0835e_0836e_0837e_0838e_0839e_0840e_0841e_0842e_0843e_0844e_0845e_0846e_0847e_0848e_0849e_0850e_0851e_0852e_0853e_0854e_0855e_085
6e_0857e_0858e_0859e_0860e_0861e_0862e_0863e_0864e_0865e_0866e_0867e_0868e_0869e_0870e_0871e_0872e_0873e_0874e_0875e_0876e_0877e_0878e_0879e_0880e_0881e_0882e_0883e_0884e_0885e_0886e_0887e_0888e_0889e_0890e_0891e_0892e_0893e_0894e_0895e_0896e_0897e_0898e_0899e_0900e_0901e_0902e_0903e_0904e_0905e_0906e_0907e_0908e_0909e_0910e_0911e_0912e_0913e_0914e_0915e_0916e_0917e_0918e_0919e_0920e_0921e_0922e_0923e_0924e_0925e_0926e_0927e_0928e_0929e_0930e_0931e_0932e_0933e_0934e_0935e_0936e_0937e_0938e_0939e_0940e_0941e_0942e_0943e_0944e_0945e_0946e_0947e_0948e_0949e_0950e_0951e_0952e_0953e_0954e_0955e_0956e_0957e_0958e_0959e_0960e_0961e_0962e_0963e_0964e_0965e_0966e_0967e_0968e_0969e_0970e_0971e_0972e_0973e_0974e_0975e_0976e_0977e_0978e_0979e_0980e_0981e_0982e_0983e_0984e_0985e_0986e_0987e_0988e_0989e_0990e_0991e_0992e_0993e_0994e_0995e_0996e_0997e_0998e_0999e_1000e_1001e_1002e_1003e_1004e_1005e_1006e_1007e_1008e_1009e_1010e_1011e_1012e_1013e_1014e_1015e_1016e_1017e_1018e_1019e_1020e_1021e_1022e_1023
0femaleI loved this doctor!-0.6011800.639239-1.060369-0.493731-0.560601-1.008939-0.598373-0.672984-0.6407090.035109-0.3948581.125174-0.8097090.092503-1.561161-0.338891-0.980971-0.218150-0.7702180.518710-0.154178-0.465516-0.6360970.136777-0.6710580.8874001.150700-0.255780-0.124600-1.695019-0.176871-0.5545250.4202711.104315-0.662254-1.104489-0.150348-0.328107-0.265295-0.232560-0.7322000.1028511.9202830.3450620.727855-0.558262-0.7278790.068228-0.288561-1.3769030.480348-0.951236-0.184960-0.977992-0.494253-0.1428200.1861240.165433-0.0546850.401775-0.606251-0.4003750.273657-0.3473730.4304650.691614-0.515043-0.0891490.224054-0.449324-0.1940170.594868-0.614699-0.372429-0.152741-0.0660521.074707-0.8100090.675266-0.6094820.561731-0.939348-0.691044-0.9950840.166328-1.531809-0.379524-0.498860-0.741533-0.4136291.733109-0.791184-0.098716-1.2333200.1377900.9388240.544055-1.0248580.578154-0.508842-1.0234410.5978451.085201-1.700814-0.9308980.512371-1.246665-0.3100880.550669-1.0522630.829993-0.637790-0.438172-0.5685370.722001-0.957278-0.768909-0.1607051.836634-0.5814770.4889770.3475040.7836550.589048-0.7704690.439723-0.4087670.2952091.1492680.1605610.342767-1.275258-0.0754610.347347-1.197512-1.3467580.052439-1.9963780.061255-0.809439-0.636264-0.5216080.2096661.2013791.3041540.8589281.3730420.723125-0.4440270.397904-1.1853890.3090250.1011400.790087-0.622007-0.557396-1.449296-0.3101371.2940560.66767-1.077920-0.054805-0.5713641.299067-0.331780-0.8400441.2820670.4256451.46890-0.6629420.3120711.4208560.0849830.438224-0.310173-0.9818180.668649-1.796632-0.4765230.1715810.08128-1.0558690.7311450.0827700.402360-0.1115071.0526060.101429-0.436716-0.689745-0.359305-0.8498180.102386-0.674699-0.6323860.635284-0.4542860.002086-0.698927-1.2612980.795101-0.073547-0.3258370.421853-1.6209931.901134-0.371985-1.0750060.779401-0.9817261.718573-0.156533-1.5014770.638842-0.603821-0.441458-0.4199341.299583-0.3290410.1870531.4767160.8418901.3788841.4159930.4902280.93683-1.134727-1.298774-0.237284-0.639338-0.0
62777-0.571427-0.696611-1.6742790.2001180.5667581.2580070.281263-0.2273860.403024-0.913720-0.332624-1.145163-1.3734160.726468-0.116224-1.0800731.629549...-0.5972580.473389-0.087902-0.734512-0.1921771.0983240.252797-0.2203800.9708340.3796410.7025790.312840-0.0148650.076790-0.9267110.283459-0.201210-1.5075441.0131600.399853-0.560346-0.4324600.7387940.2710190.7580120.1049480.032012-1.1182630.817341-0.134954-0.367428-1.0955111.424716-0.45837-1.0052591.168612-0.739624-0.778042-0.3567350.4704580.1813060.867469-0.033199-0.0597420.067898-0.3965841.678158-0.8867950.4317720.239491-0.3982060.357574-0.6494860.8849560.774565-0.0919670.539807-0.0988390.4074670.0224930.596556-2.279631-1.012586-0.5154141.0084940.0244490.786387-0.039095-0.2824671.2106150.0090270.694995-0.778203-0.434733-0.5461210.111783-0.414437-0.186292-0.9243110.771270-0.726940-0.002945-0.904097-0.78010-1.3443930.4190250.236579-0.1475060.4229310.268999-1.120625-2.3463390.0592630.432407-0.0291690.342242-0.2277180.429898-0.4874600.215381-1.7555920.5718061.145492-0.5952260.279368-1.833523-0.318555-0.3342401.5460890.9961790.3653550.7957560.931366-1.3288362.2218190.5337930.4196470.6070961.1482810.962832-0.6275070.023852-0.9770260.372186-0.191951-0.2614941.2797360.7434370.3129430.249434-1.020184-0.526093-0.145118-1.2249160.0138930.314860-0.184937-0.3251641.3663730.2746570.026925-0.244764-0.0874592.440723-0.2114441.791491-1.7837601.172868-1.5885790.5474281.2364030.2387651.0740800.9718041.481358-0.260144-0.372862-1.6688350.8141270.459048-0.537239-1.363500-1.9370480.223611-0.0939470.2061381.323856-0.8814260.858833-0.481818-1.634061.143431-0.822667-0.3892360.754676-0.4743681.164978-1.2494320.841197-0.2711010.239336-0.874708-0.4846081.776312-0.655398-0.5954011.292877-0.6730881.1837251.045448-0.711501-0.435948-0.414408-0.820870.1259830.0924120.571426-1.3696500.498595-0.114022-2.056757-0.606038-0.014727-1.732948-0.208160-0.2579680.3362720.292738-1.0208950.707942-0.4130660.015892-0.8706560.356665-1.2406250.697207-0.899096-0.546
2831.3460670.1515490.608179-0.642331-0.4913671.476060-0.2393410.2100750.6538710.124511-1.4507960.1317110.597644-0.2396550.151939-0.9892971.1201320.0863770.172451-1.515352-0.4225611.6188941.162732-0.041656-0.4737720.420647-0.4828610.206311-0.8063560.864795-0.179643-0.095540
1maleThis doctor was absolutely terrible.-1.0803211.2837100.032944-0.505388-0.6322840.2407790.4977000.061434-0.951467-1.0999140.3717871.267668-0.751966-0.042724-0.1420160.127234-0.733424-1.139796-0.3250700.430322-0.0980041.1630771.0571900.532064-0.054028-0.3447831.0421960.1325360.173455-0.846880-0.294927-1.092173-0.7391570.072505-1.381498-0.039768-0.596037-0.635421-0.102166-0.223891-0.1106681.6100510.1244950.2625220.4711820.3639860.1492841.757610-0.095173-0.828335-0.169187-0.1673540.181549-0.4680740.173165-0.1514720.153541-0.070349-0.0706821.3468130.838431-0.173599-0.698330-0.9070780.686929-0.253123-0.253507-0.8162850.577228-0.471222-0.3195030.318208-1.1523131.6080940.0203860.2408811.051513-0.431564-0.7340530.355924-0.735063-1.024491-0.607373-0.363772-1.032262-0.755497-1.072544-0.3303460.112159-0.7658532.702498-0.059790-2.331072-0.2614090.662297-0.1348032.094935-1.216020-1.468843-0.590109-0.6033790.032229-0.734086-1.041735-0.0968810.252744-0.755398-0.196471-0.6734080.3231160.4851700.8522330.0380430.1065031.900742-0.4739680.440853-0.1242180.818130-0.2499000.174284-2.027710-0.841279-0.510334-1.589421-0.064431-0.2041340.107323-0.129780-0.373625-0.085754-0.3891580.6304510.811590-1.157425-0.0366670.638930-0.0318280.162673-0.7457010.0473400.0419560.4555311.466353-0.4932030.3151980.9564630.169743-0.9037401.078133-0.6391520.206805-1.2127010.061930-1.5870890.509692-0.5807040.7431370.4392200.11038-1.2474471.3239400.4044720.451868-1.951448-2.136478-0.8246890.5207470.87729-0.3656770.6085081.2913220.1417760.6687820.493870-0.911925-0.265987-0.3425150.059859-0.457266-0.244781.999361-0.0125800.126561-0.4439191.152566-0.219918-0.358424-0.2155550.1699460.1934130.4254130.506095-2.375514-0.682047-0.2127790.261091-0.382527-0.4230460.0875690.485063-0.3426600.4559860.331639-1.6484971.399007-0.5948000.471352-0.7419820.568690-0.5373441.354499-1.5215430.2226860.505541-0.3844660.0489470.243410-1.0031860.4426021.2569650.7188531.4583851.336809-1.110115-0.28113-0.0214410.969155-0.324079-0.551153
-0.346971-0.426813-0.909856-0.2245910.5192700.4363780.5570020.6159460.307261-0.292611-0.646692-0.091192-0.1241680.0447920.370954-1.421038-1.3210871.192953...-0.160493-1.280425-0.7698620.573256-1.2979331.4924511.2445440.312218-0.6207410.3679662.4169982.586343-1.1355450.8969540.391467-0.6747750.383277-0.9505781.830727-1.018144-0.007086-0.4910240.5202390.6753521.206401-1.113754-1.293386-0.9286700.7358770.426821-0.453119-0.5054700.643926-0.40995-1.265347-0.0863700.149850-0.0145410.1525790.2141340.1909000.483520-0.1211190.216187-0.0957050.484240-0.2564380.1287060.1241240.442363-0.3288520.839022-0.413680-0.2183010.031112-0.7815770.8773760.4261510.650736-0.5343631.324010-2.276321-3.2098080.747673-0.090331-0.7947440.9102270.0642110.187118-0.292773-0.7518700.891957-0.681515-1.061648-0.5733870.548157-0.167158-0.570218-0.1153140.747868-0.937214-0.019237-1.126545-0.36322-1.2342320.423862-0.2699320.5761940.8495810.444871-0.502688-1.018462-0.920363-0.202659-0.4564581.216924-0.1851810.4860690.2670840.5853350.036500-0.0486801.431088-0.1418620.566101-1.2383890.0729490.038206-0.2939411.5364630.458766-0.149625-0.717818-0.0797801.7018690.439535-0.1746740.958559-0.0547500.944752-0.0188440.701800-0.7699890.2530600.769639-0.6076090.6963540.1711431.1060530.268299-1.0479650.6401540.143615-1.105975-0.0162270.1424680.596629-0.452742-0.313863-0.227832-0.207953-0.843668-1.5027741.050109-0.0421790.633935-0.994892-0.309290-1.750694-1.035756-0.8934230.4391060.4684170.3322140.6155650.167857-0.761188-0.513775-0.7272990.2331100.549183-1.956708-0.498497-0.176335-1.125636-0.663086-0.504846-0.2848071.4123281.304304-0.673631.146111-1.070053-0.5989150.518672-0.419871-0.001672-0.9151211.0481801.200090-1.123845-0.956011-0.7798011.2263840.2999320.497791-0.184537-0.0283790.1855980.613601-0.006552-0.3405420.135926-0.15309-0.933908-0.3275881.260057-0.727343-0.0199710.352552-0.667697-1.120148-0.2577280.3430140.514783-1.494829-0.767745-0.098165-0.5325861.3007450.445362-0.5910720.4727840.128228-0.951936-0.301227-0
.8290750.3564932.177831-0.4537400.180738-0.3661110.788271-0.376016-0.1677960.9450920.318102-0.313438-0.521864-0.8046450.371298-0.102799-0.398658-0.6749320.7127330.402257-0.189253-1.7440410.592453-0.1014461.562682-0.446034-0.0733160.778162-0.6702580.576500-0.036422-0.237191-0.103962-0.018753
-

2 rows × 1026 columns

-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_lda_topics[source]

Autocoder.code_lda_topics(docs, df, k=10, n_features=10000)

-
-

Encode texts as semantically meaningful vectors using Latent Dirichlet Allocation

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
comments = ["What is your favorite sitcom of all time?", 'I cannot wait to vote!']
-df = pd.DataFrame({
-    'over_18': ['yes', 'no'] * 5,
-     'comments' : comments * 5,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
over_18comments
0yesWhat is your favorite sitcom of all time?
1noI cannot wait to vote!
2yesWhat is your favorite sitcom of all time?
3noI cannot wait to vote!
4yesWhat is your favorite sitcom of all time?
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df = ac.code_lda_topics(df['comments'].values, df)
-
- -
-
-
- -
-
- -
- -
-
preprocessing texts...
-fitting model...
-iteration: 1 of max_iter: 5
-iteration: 2 of max_iter: 5
-iteration: 3 of max_iter: 5
-iteration: 4 of max_iter: 5
-iteration: 5 of max_iter: 5
-done.
-done.
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
over_18commentstopic_0000topic_0001topic_0002topic_0003topic_0004topic_0005topic_0006topic_0007topic_0008topic_0009
0yesWhat is your favorite sitcom of all time?0.1487630.0933410.0807230.1289110.1098160.0847240.0936110.0808600.0917580.087493
1noI cannot wait to vote!0.0856870.0977490.1424860.0841450.0869310.0996080.0919130.1147410.0930140.103728
2yesWhat is your favorite sitcom of all time?0.1487630.0933410.0807230.1289110.1098160.0847240.0936110.0808600.0917580.087493
3noI cannot wait to vote!0.0856870.0977490.1424860.0841450.0869310.0996080.0919130.1147410.0930140.103728
4yesWhat is your favorite sitcom of all time?0.1487630.0933410.0807230.1289110.1098160.0847240.0936110.0808600.0917580.087493
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

Autocoder.code_callable[source]

Autocoder.code_callable(docs, df, fn)

-
-

Autocodes text for any user-specified function -The fn parameter must be a Callable and return a dictionary for each -text in docs where the keys are desired column names and values are scores -or probabilities.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
reviews = ["I loved this doctor!", "This doctor was absolutely terrible."]
-df = pd.DataFrame({
-    'gender': ['female', 'male'],
-     'review' : reviews,
-      })
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - -
genderreview
0femaleI loved this doctor!
1maleThis doctor was absolutely terrible.
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
def some_function(x):
-    val = int('terrible' in x)
-    return {'has_the_word_terrible?' : val}
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df = ac.code_callable(df.review.values, df, some_function)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - -
genderreviewhas_the_word_terrible?
0femaleI loved this doctor!0
1maleThis doctor was absolutely terrible.1
-
- -
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/causalinference.html b/docs/causalinference.html deleted file mode 100644 index 95c39e7..0000000 --- a/docs/causalinference.html +++ /dev/null @@ -1,1900 +0,0 @@ ---- - -title: Causal Inference - - -keywords: fastai -sidebar: home_sidebar - -summary: "Causal Inference API" -description: "Causal Inference API" -nb_path: "nbs/00_causalinference.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalInferenceModel[source]

CausalInferenceModel(df, method='t-learner', metalearner_type=None, treatment_col='treatment', outcome_col='outcome', text_col=None, ignore_cols=[], include_cols=[], treatment_effect_col='treatment_effect', learner=None, effect_learner=None, min_df=0.05, max_df=0.5, ngram_range=(1, 1), stop_words='english', verbose=1)

-
-

Infers causality from the data contained in df using a metalearner.

-

Usage:

-
>>> cm = CausalInferenceModel(df,
-                              treatment_col='Is_Male?',
-                              outcome_col='Post_Shared?', text_col='Post_Text',
-                              ignore_cols=['id', 'email'])
-    cm.fit()
-
-

Parameters:

-
    -
  • df : pandas.DataFrame containing dataset
  • -
  • method : metalearner model to use. One of {'t-learner', 's-learner', 'x-learner', 'r-learner'} (Default: 't-learner')
  • -
  • metalearner_type : Alias of method for backwards compatibility. Overrides method if not None.
  • -
  • treatment_col : treatment variable; column should contain binary values: 1 for treated, 0 for untreated.
  • -
  • outcome_col : outcome variable; column should contain the categorical or numeric outcome values
  • -
  • text_col : (optional) text column containing the strings (e.g., articles, reviews, emails).
  • -
  • ignore_cols : columns to ignore in the analysis
  • -
  • include_cols : columns to include as covariates (e.g., possible confounders)
  • -
  • treatment_effect_col : name of column to hold causal effect estimations. Does not need to exist. Created by CausalNLP.
  • -
  • learner : an instance of a custom learner. If None, Log/Lin Regression is used for S-Learner -
              and a default LightGBM model will be used for all other metalearner types.
    -
    - # Example - learner = LGBMClassifier(num_leaves=1000)
  • -
  • effect_learner: used for x-learner/r-learner and must be regression model
  • -
  • min_df : min_df parameter used for text processing using sklearn
  • -
  • max_df : max_df parameter used for text processing using sklearn
  • -
  • ngram_range: ngrams used for text vectorization. default: (1,1)
  • -
  • stop_words : stop words used for text processing (from sklearn)
  • -
  • verbose : If 1, print informational messages. If 0, suppress.
  • -
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.fit[source]

CausalInferenceModel.fit(p=None)

-
-

Fits a causal inference model and estimates outcome -with and without treatment for each observation. -For X-Learner and R-Learner, propensity scores will be computed -using default propensity model unless p is not None. -Parameter p is not used for other methods.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.tune_and_use_default_learner[source]

CausalInferenceModel.tune_and_use_default_learner(split_pct=0.2, random_state=314, scoring=None)

-
-

Tunes the hyperparameters of a default LightGBM model, replaces CausalInferenceModel.learner, -and returns best parameters. -Should be invoked prior to running CausalInferenceModel.fit. -If scoring is None, then 'roc_auc' is used for classification and 'negative_mean_squared_error' -is used for regression.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.predict[source]

CausalInferenceModel.predict(df, p=None)

-
-

Estimates the treatment effect for each observation in df. -The DataFrame represented by df should be the same format -as the one supplied to CausalInferenceModel.__init__. -For X-Learner and R-Learner, propensity scores will be computed -using default propensity model unless p is not None. -Parameter p is not used for other methods.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.get_required_columns[source]

CausalInferenceModel.get_required_columns()

-
-

Returns required columns that must exist in any DataFrame supplied to CausalInferenceModel.predict.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.estimate_ate[source]

CausalInferenceModel.estimate_ate(bool_mask=None)

-
-

Estimates the treatment effect for each observation in -self.df.

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

The bool_mask parameter can be used to estimate the conditional average treatment estimate (CATE). -For instance, to estimate the average treatment effect for only those individuals over 18 years of age:

-
cm.estimate_ate(cm.df['age']>18])
-
- -
-
-
- {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.evaluate_robustness[source]

CausalInferenceModel.evaluate_robustness(sample_size=0.8)

-
-

Evaluates robustness on four sensitivity measures (see CausalML package for details on these methods):

-
    -
  • Placebo Treatment: ATE should become zero.
  • -
  • Random Cause: ATE should not change.
  • -
  • Random Replacement: ATE should not change.
  • -
  • Subset Data: ATE should not change.
  • -
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.interpret[source]

CausalInferenceModel.interpret(plot=False, method='feature_importance')

-
-

Returns feature importances of treatment effect model. -The method parameter must be one of {'feature_importance', 'shap_values'}

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.explain[source]

CausalInferenceModel.explain(df, row_index=None, row_num=0, background_size=50, nsamples=500)

-
-

Explain the treatment effect estimate of a single observation using SHAP.

-

Parameters:

-
    -
  • df (pd.DataFrame): a pd.DataFrame of test data is same format as original training data DataFrame
  • -
  • row_num (int): raw row number in DataFrame to explain (default:0, the first row)
  • -
  • background_size (int): size of background data (SHAP parameter)
  • -
  • nsamples (int): number of samples (SHAP parameter)
  • -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Usage Example: Do social media posts by women get shared more often than those by men?

Let's create a simulated dataset.

- -
-
-
- {% raw %} - -
-
- -
-
-
import itertools
-import pandas as pd
-data = ((*a, b) for (a, b) in zip(itertools.product([0,1], [0,1], [0,1]), [36, 234, 25, 55, 6, 81, 71, 192]))
-df = pd.DataFrame(data, columns=['Is_Male?', 'Post_Text', 'Post_Shared?', 'N'])
-df = df.loc[df.index.repeat(df['N'])].reset_index(drop=True).drop(columns=['N'])
-values = sorted(df['Post_Text'].unique())
-df['Post_Text'].replace(values, ['I really love my job!', 'My boss is pretty terrible.'], inplace=True)
-original_df = df.copy()
-df = None
-original_df.head()
-
- -
-
-
- -
-
- -
- - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?
00I really love my job!0
10I really love my job!0
20I really love my job!0
30I really love my job!0
40I really love my job!0
-
-
- -
- -
-
- -
- {% endraw %} - -
-
-

At first glance, it seems like posts by women get shared more often. More specifically, it appears that being male reduces the chance your post is shared by 4.5 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
male_probability = original_df[(original_df['Is_Male?']==1)]['Post_Shared?'].value_counts(normalize=True)[1]
-male_probability
-
- -
-
-
- -
-
- -
- - - -
-
0.78
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
female_probability = original_df[(original_df['Is_Male?']==0)]['Post_Shared?'].value_counts(normalize=True)[1]
-female_probability
-
- -
-
-
- -
-
- -
- - - -
-
0.8257142857142857
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
male_probability-female_probability
-
- -
-
-
- -
-
- -
- - - -
-
-0.04571428571428571
-
- -
- -
-
- -
- {% endraw %} - -
-
-

However, this is inaccurate. In fact, this is an example of Simpson's Paradox, and the true causal effect of being male in this simulated dataset is roughly 0.05 (as opposed to -0.045) with men's posts being more likely to be shared. The reason is that women in this simulation tend to make more positive posts which tend to be shared more often here. Post sentiment, then, is a mediator, which is statistically similar to a confounder.

-

When controlling for the sentiment of the post (the mediator variable in this dataset), it is revealed that men's posts are, in fact, shared more often (for both negative posts and positive posts). This can be quickly and easily estimated in CausalNLP.

-

Causal Inference from Text with Autocoders

Let's first use the Autocoder to transform the raw text into sentiment. We can then control for sentiment when estimating the causal effect.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp.autocoder import Autocoder
-ac = Autocoder()
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=False, batch_size=16)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?negativepositive
00I really love my job!00.0191910.980809
10I really love my job!00.0191910.980809
20I really love my job!00.0191910.980809
30I really love my job!00.0191910.980809
40I really love my job!00.0191910.980809
-
-
- -
- -
-
- -
- {% endraw %} - -
-
-

When autocoding the raw text for sentiment, we have chosen to use the raw "probabilities" with binarize=False. A binary variable can also be used with binarize=True.

-

Next, let's estimate the treatment effects. We will ignore the positive and Post_Shared? columns, as their information is captured by the negative column in this example. We will use the T-Learner. See this paper for more information on metalearner types.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp.causalinference import CausalInferenceModel
-cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='Is_Male?', outcome_col='Post_Shared?',
-                          include_cols=['negative'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: Is_Male?
-numerical/categorical covariates: ['negative']
-preprocess time:  0.012983322143554688  sec
-start fitting causal inference model
-time to fit causal inference model:  0.9569253921508789  sec
-
-
-
- -
- - - -
-
<causalnlp.causalinference.CausalInferenceModel at 0x7efd400dbd30>
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Upon controlling for sentiment, we see that the overall average treatment is correctly estimated as 0.05.

- -
-
-
- {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.05366850622769351}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Since this is a small, simulated, toy problem, we can manually calculate the adjusted treatment effect by controlling for the single confounder (i.e., post negativity):

- -
-
-
- {% raw %} - -
-
- -
-
-
from collections import defaultdict
-def ATE_adjusted(C, T, Y):
-    x = defaultdict(list)
-    for c, t, y in zip(C, T, Y):
-        x[c, t].append(y)
-
-    C0_ATE = np.mean(x[0,1]) - np.mean(x[0,0])
-    C1_ATE = np.mean(x[1,1]) - np.mean(x[1,0])
-    return np.mean([C0_ATE, C1_ATE])
-ATE_adjusted((df['negative']>0.5).astype('int'), df['Is_Male?'].values, df['Post_Shared?'].values)
-
- -
-
-
- -
-
- -
- - - -
-
0.0534529194528211
-
- -
- -
-
- -
- {% endraw %} - -
-
-

We see that this value is close to our estimate.

-

CausalNLP allows you to easily compute conditional or individualized treatment effects. -For instance, for negative posts, being male increases the chance of your post being shared by about 4 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['negative']>0.9)
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.042535751074149745}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

For positive posts, being male increases the chance of your post being shared by about 6 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['negative']<0.1)
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.06436468274776497}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.05
-assert ate['ate'] < 0.055
-
- -
-
-
- -
- {% endraw %} - -
-
-

Predictions can be made for new observations. We just have to make sure it contains the relevant columns included in the DataFrame supplied to CausalInferenceModel.fit. In this case, it must include Is_Male? and negative. This can be verified with the CausalInferenceModel.get_required_columns method:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['Is_Male?', 'negative']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-     'text' : ['I love my life.'],
-    'Is_Male?' : [0],
-    'negative' : [0]
-      })
-effect = cm.predict(test_df)
-assert effect[0][0] < 0.065
-assert effect[0][0] > 0.064
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.06436468]]
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Causal Inference Using Raw Text as a Confounder/Mediator

In the example above, we approached the problem under the assumption that a specific linguistic property (sentiment) was an important mediator or confounder for which to control. In some cases, there may also be other unknown linguistic properties that are potential confounders/mediators (e.g., topic, politeness, toxic language, readability).

-

In CausalNLP, we can also use the raw text as the potential confounder/mediator.

- -
-
-
- {% raw %} - -
-
- -
-
-
cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='Is_Male?', outcome_col='Post_Shared?', text_col='Post_Text',
-                         ignore_cols=['negative', 'positive'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: Is_Male?
-numerical/categorical covariates: []
-text covariate: Post_Text
-preprocess time:  0.01604151725769043  sec
-start fitting causal inference model
-time to fit causal inference model:  0.08830595016479492  sec
-
-
-
- -
- - - -
-
<causalnlp.causalinference.CausalInferenceModel at 0x7efd400ea358>
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Although we have excluded the negative and positive columns as extra covariates, you can use traditional categorical/numerical covariates in combination with a text field covariate (if they exist as extra columns in the dataframe).

-

Here, we see that the same causal estimates are returned, as the text is easy to infer as positive or negative based on their correlations with the outcomes in this problem.

- -
-
-
- {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.05366850622769351}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.estimate_ate(df['Post_Text'] == 'My boss is pretty terrible.')
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.042535751074149745}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.estimate_ate(df['Post_Text'] == 'I really love my job!')
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.06436468274776497}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.05
-assert ate['ate'] < 0.055
-
- -
-
-
- -
- {% endraw %} - -
-
- -
Make predictions on new data.  Again, make sure the DataFrame contains the relevant columns included in the original DataFrame supplied to [`CausalInferenceModel.fit`](/causalnlp/causalinference.html#CausalInferenceModel.fit):
- -
-
-
- {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['Is_Male?', 'Post_Text']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-     'Post_Text' : ['I love my life.'],
-    'New Column' : [1],
-    'Is_Male?' : [0],
-    'negative' : [0]
-      })
-effect = cm.predict(test_df)
-assert effect[0][0] < 0.065
-assert effect[0][0] > 0.064
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.06436468]]
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=False, method='feature_importance')
-
- -
-
-
- -
-
- -
- - - -
-
{1: v_boss        1.0
- v_terrible    0.0
- v_pretty      0.0
- dtype: float64}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=True, method='feature_importance')
-
- -
-
-
- -
-
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=True, method='shap_values')
-
- -
-
-
- -
-
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Causal Inference With Text as a Treatment

-
-
-
-
-
-

Suppose we were interested in estimating the causal impact of sentiment on the outcome. That is, sentiment of text is the treatment, and the gender is a potential confounder. As we did above, we can use the Autocoder to create the treatment variable. The only difference is that we would supply the binarize=True as an argument.

- -
-
-
- {% raw %} - -
-
- -
-
-
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=True, batch_size=16)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?negativepositive
00I really love my job!001
10I really love my job!001
20I really love my job!001
30I really love my job!001
40I really love my job!001
-
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='positive', outcome_col='Post_Shared?',
-                          include_cols=['Is_Male?'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: positive
-numerical/categorical covariates: ['Is_Male?']
-preprocess time:  0.009029388427734375  sec
-start fitting causal inference model
-time to fit causal inference model:  0.6834802627563477  sec
-
-
-
- -
- - - -
-
<causalnlp.causalinference.CausalInferenceModel at 0x7efd2cd955f8>
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.19008080596986368}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.18
-assert ate['ate'] < 0.2
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['positive', 'Is_Male?']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-    'Is_Male?' : [1],
-    'positive' : [1]
-      })
-effect = cm.predict(test_df)
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.20099539]]
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

platt_scale[source]

platt_scale(outcome, probs)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

gelu[source]

gelu(x)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

make_bow_vector[source]

make_bow_vector(ids, vocab_size, use_counts=False)

-
-

Make a sparse BOW vector from a tensor of dense ids. -Args: - ids: torch.LongTensor [batch, features]. Dense tensor of ids. - vocab_size: vocab size for this tensor. - use_counts: if true, the outgoing BOW vector will contain - feature counts. If false, will contain binary indicators. -Returns: - The sparse bag-of-words representation of ids.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalBert[source]

CausalBert(config) :: DistilBertPreTrainedModel

-
-

CausalBert is essentially an S-Learner that uses a DistilBert sequence classification model as the base learner.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalBertModel[source]

CausalBertModel(g_weight=0.0, Q_weight=0.1, mlm_weight=1.0, batch_size=32, max_length=128)

-
-

CausalBertModel is a wrapper for CausalBert

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.train[source]

CausalBertModel.train(texts, confounds, treatments, outcomes, learning_rate=2e-05, epochs=3)

-
-

Trains a CausalBert model

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.estimate_ate[source]

CausalBertModel.estimate_ate(C, W, Y=None, platt_scaling=False)

-
-

Computes average treatment effect using the trained estimator

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.inference[source]

CausalBertModel.inference(texts, confounds, outcome=None)

-
-

Perform inference using the trained model

- -
- -
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/core.causalbert.html b/docs/core.causalbert.html deleted file mode 100644 index 2dc69cc..0000000 --- a/docs/core.causalbert.html +++ /dev/null @@ -1,314 +0,0 @@ ---- - -title: CausalBert - - -keywords: fastai -sidebar: home_sidebar - -summary: "CausalBert API" -description: "CausalBert API" -nb_path: "nbs/00b_core.causalbert.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

platt_scale[source]

platt_scale(outcome, probs)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

gelu[source]

gelu(x)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

make_bow_vector[source]

make_bow_vector(ids, vocab_size, use_counts=False)

-
-

Make a sparse BOW vector from a tensor of dense ids. -Args: - ids: torch.LongTensor [batch, features]. Dense tensor of ids. - vocab_size: vocab size for this tensor. - use_counts: if true, the outgoing BOW vector will contain - feature counts. If false, will contain binary indicators. -Returns: - The sparse bag-of-words representation of ids.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalBert[source]

CausalBert(config) :: DistilBertPreTrainedModel

-
-

CausalBert is essentially an S-Learner that uses a DistilBert sequence classification model as the base learner.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalBertModel[source]

CausalBertModel(g_weight=0.0, Q_weight=0.1, mlm_weight=1.0, batch_size=32, max_length=128, model_name='distilbert-base-uncased')

-
-

CausalBertModel is a wrapper for CausalBert

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.train[source]

CausalBertModel.train(texts, confounds, treatments, outcomes, learning_rate=2e-05, epochs=3)

-
-

Trains a CausalBert model

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.estimate_ate[source]

CausalBertModel.estimate_ate(C, W, Y=None, platt_scaling=False)

-
-

Computes average treatment effect using the trained estimator

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalBertModel.inference[source]

CausalBertModel.inference(texts, confounds, outcome=None)

-
-

Perform inference using the trained model

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Example

This implementation of CausalBert was adapted from Causal Effects of Linguistic Properties by Pryzant et al. CausalBert is essentially a kind of S-Learner that uses a DistilBert sequence classification model as the base learner.

- -
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', error_bad_lines=False)
-from causalnlp.core.causalbert import CausalBertModel
-cb = CausalBertModel(batch_size=32, max_length=128)
-cb.train(df['text'], df['C_true'], df['T_ac'], df['Y_sim'], epochs=1, learning_rate=2e-5)
-print(cb.estimate_ate(df['C_true'], df['text']))
-
- -
-
-
- -
-
- -
- -
-
Some weights of CausalBert were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['Q_cls.1.0.bias', 'Q_cls.0.0.bias', 'g_cls.weight', 'Q_cls.1.0.weight', 'g_cls.bias', 'Q_cls.1.2.bias', 'Q_cls.0.2.weight', 'Q_cls.0.0.weight', 'Q_cls.0.2.bias', 'Q_cls.1.2.weight']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-100%|██████████| 666/666 [02:12<00:00,  5.01it/s]
-100%|██████████| 666/666 [00:27<00:00, 24.32it/s]
-
-
- -
- -
-
0.17478953341997637
-
-
-
- -
- -
-
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

(Reduce the batch_size if you receive an Out-Of-Memory error when running the code above.)

- -
-
-
-
- - diff --git a/docs/core.causalinference.html b/docs/core.causalinference.html deleted file mode 100644 index c040e47..0000000 --- a/docs/core.causalinference.html +++ /dev/null @@ -1,1789 +0,0 @@ ---- - -title: Causal Inference - - -keywords: fastai -sidebar: home_sidebar - -summary: "Causal Inference API" -description: "Causal Inference API" -nb_path: "nbs/00a_core.causalinference.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class CausalInferenceModel[source]

CausalInferenceModel(df, method='t-learner', metalearner_type=None, treatment_col='treatment', outcome_col='outcome', text_col=None, ignore_cols=[], include_cols=[], treatment_effect_col='treatment_effect', learner=None, effect_learner=None, min_df=0.05, max_df=0.5, ngram_range=(1, 1), stop_words='english', verbose=1)

-
-

Infers causality from the data contained in df using a metalearner.

-

Usage:

-
>>> cm = CausalInferenceModel(df,
-                              treatment_col='Is_Male?',
-                              outcome_col='Post_Shared?', text_col='Post_Text',
-                              ignore_cols=['id', 'email'])
-    cm.fit()
-
-

Parameters:

-
    -
  • df : pandas.DataFrame containing dataset
  • -
  • method : metalearner model to use. One of {'t-learner', 's-learner', 'x-learner', 'r-learner'} (Default: 't-learner')
  • -
  • metalearner_type : Alias of method for backwards compatibility. Overrides method if not None.
  • -
  • treatment_col : treatment variable; column should contain binary values: 1 for treated, 0 for untreated.
  • -
  • outcome_col : outcome variable; column should contain the categorical or numeric outcome values
  • -
  • text_col : (optional) text column containing the strings (e.g., articles, reviews, emails).
  • -
  • ignore_cols : columns to ignore in the analysis
  • -
  • include_cols : columns to include as covariates (e.g., possible confounders)
  • -
  • treatment_effect_col : name of column to hold causal effect estimations. Does not need to exist. Created by CausalNLP.
  • -
  • learner : an instance of a custom learner. If None, Log/Lin Regression is used for S-Learner -
              and a default LightGBM model will be used for all other metalearner types.
    -
    - # Example - learner = LGBMClassifier(num_leaves=1000)
  • -
  • effect_learner: used for x-learner/r-learner and must be regression model
  • -
  • min_df : min_df parameter used for text processing using sklearn
  • -
  • max_df : max_df parameter used for text processing using sklearn
  • -
  • ngram_range: ngrams used for text vectorization. default: (1,1)
  • -
  • stop_words : stop words used for text processing (from sklearn)
  • -
  • verbose : If 1, print informational messages. If 0, suppress.
  • -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TypeDefaultDetails
dfNo Content
methodstrt-learnerNo Content
metalearner_typeNonealias for method
treatment_colstrtreatmentNo Content
outcome_colstroutcomeNo Content
text_colNoneNo Content
ignore_colslistNoneNo Content
include_colslistNoneNo Content
treatment_effect_colstrtreatment_effectNo Content
learnerNoneNo Content
effect_learnerNoneNo Content
min_dffloat0.05No Content
max_dffloat0.5No Content
ngram_rangetuple(1, 1)No Content
stop_wordsstrenglishNo Content
verboseint1No Content
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.fit[source]

CausalInferenceModel.fit(p=None)

-
-

Fits a causal inference model and estimates outcome -with and without treatment for each observation. -For X-Learner and R-Learner, propensity scores will be computed -using default propensity model unless p is not None. -Parameter p is not used for other methods.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.tune_and_use_default_learner[source]

CausalInferenceModel.tune_and_use_default_learner(split_pct=0.2, random_state=314, scoring=None)

-
-

Tunes the hyperparameters of a default LightGBM model, replaces CausalInferenceModel.learner, -and returns best parameters. -Should be invoked prior to running CausalInferenceModel.fit. -If scoring is None, then 'roc_auc' is used for classification and 'negative_mean_squared_error' -is used for regression.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.predict[source]

CausalInferenceModel.predict(df, p=None)

-
-

Estimates the treatment effect for each observation in df. -The DataFrame represented by df should be the same format -as the one supplied to CausalInferenceModel.__init__. -For X-Learner and R-Learner, propensity scores will be computed -using default propensity model unless p is not None. -Parameter p is not used for other methods.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.get_required_columns[source]

CausalInferenceModel.get_required_columns()

-
-

Returns required columns that must exist in any DataFrame supplied to CausalInferenceModel.predict.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.estimate_ate[source]

CausalInferenceModel.estimate_ate(bool_mask=None)

-
-

Estimates the treatment effect for each observation in -self.df.

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

The bool_mask parameter can be used to estimate the conditional average treatment estimate (CATE). -For instance, to estimate the average treatment effect for only those individuals over 18 years of age:

-
cm.estimate_ate(cm.df['age']>18)
-
- -
-
-
- {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.evaluate_robustness[source]

CausalInferenceModel.evaluate_robustness(sample_size=0.8)

-
-

Evaluates robustness on four sensitivity measures (see CausalML package for details on these methods):

-
    -
  • Placebo Treatment: ATE should become zero.
  • -
  • Random Cause: ATE should not change.
  • -
  • Random Replacement: ATE should not change.
  • -
  • Subset Data: ATE should not change.
  • -
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.interpret[source]

CausalInferenceModel.interpret(plot=False, method='feature_importance')

-
-

Returns feature importances of treatment effect model. -The method parameter must be one of {'feature_importance', 'shap_values'}

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

CausalInferenceModel.explain[source]

CausalInferenceModel.explain(df, row_index=None, row_num=0, background_size=50, nsamples=500)

-
-

Explain the treatment effect estimate of a single observation using SHAP.

-

Parameters:

-
    -
  • df (pd.DataFrame): a pd.DataFrame of test data is same format as original training data DataFrame
  • -
  • row_num (int): raw row number in DataFrame to explain (default:0, the first row)
  • -
  • background_size (int): size of background data (SHAP parameter)
  • -
  • nsamples (int): number of samples (SHAP parameter)
  • -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Usage Example: Do social media posts by women get shared more often than those by men?

Let's create a simulated dataset.

- -
-
-
- {% raw %} - -
-
- -
-
-
import itertools
-import pandas as pd
-data = ((*a, b) for (a, b) in zip(itertools.product([0,1], [0,1], [0,1]), [36, 234, 25, 55, 6, 81, 71, 192]))
-df = pd.DataFrame(data, columns=['Is_Male?', 'Post_Text', 'Post_Shared?', 'N'])
-df = df.loc[df.index.repeat(df['N'])].reset_index(drop=True).drop(columns=['N'])
-values = sorted(df['Post_Text'].unique())
-df['Post_Text'].replace(values, ['I really love my job!', 'My boss is pretty terrible.'], inplace=True)
-original_df = df.copy()
-df = None
-original_df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?
00I really love my job!0
10I really love my job!0
20I really love my job!0
30I really love my job!0
40I really love my job!0
-
- -
- -
-
- -
- {% endraw %} - -
-
-

At first glance, it seems like posts by women get shared more often. More specifically, it appears that being male reduces the chance your post is shared by 4.5 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
male_probability = original_df[(original_df['Is_Male?']==1)]['Post_Shared?'].value_counts(normalize=True)[1]
-male_probability
-
- -
-
-
- -
-
- -
- - - -
-
0.78
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
female_probability = original_df[(original_df['Is_Male?']==0)]['Post_Shared?'].value_counts(normalize=True)[1]
-female_probability
-
- -
-
-
- -
-
- -
- - - -
-
0.8257142857142857
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
male_probability-female_probability
-
- -
-
-
- -
-
- -
- - - -
-
-0.04571428571428571
-
- -
- -
-
- -
- {% endraw %} - -
-
-

However, this is inaccurate. In fact, this is an example of Simpson's Paradox, and the true causal effect of being male in this simulated dataset is roughly 0.05 (as opposed to -0.045) with men's posts being more likely to be shared. The reason is that women in this simulation tend to make more positive posts which tend to be shared more often here. Post sentiment, then, is a mediator, which is statistically similar to a confounder.

-

When controlling for the sentiment of the post (the mediator variable in this dataset), it is revealed that men's posts are, in fact, shared more often (for both negative posts and positive posts). This can be quickly and easily estimated in CausalNLP.

-

Causal Inference from Text with Autocoders

Let's first use the Autocoder to transform the raw text into sentiment. We can then control for sentiment when estimating the causal effect.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp.autocoder import Autocoder
-ac = Autocoder()
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=False, batch_size=16)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?negativepositive
00I really love my job!00.0191910.980809
10I really love my job!00.0191910.980809
20I really love my job!00.0191910.980809
30I really love my job!00.0191910.980809
40I really love my job!00.0191910.980809
-
- -
- -
-
- -
- {% endraw %} - -
-
-

When autocoding the raw text for sentiment, we have chosen to use the raw "probabilities" with binarize=False. A binary variable can also be used with binarize=True.

-

Next, let's estimate the treatment effects. We will ignore the positive and Post_Shared? columns, as their information is captured by the negative column in this example. We will use the T-Learner. See this paper for more information on metalearner types.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp import CausalInferenceModel
-cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='Is_Male?', outcome_col='Post_Shared?',
-                          include_cols=['negative'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: Is_Male?
-numerical/categorical covariates: ['negative']
-preprocess time:  0.013550996780395508  sec
-start fitting causal inference model
-time to fit causal inference model:  0.8901166915893555  sec
-
-
-
- -
- - - -
-
<causalnlp.core.causalinference.CausalInferenceModel at 0x7fca0c2040f0>
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Upon controlling for sentiment, we see that the overall average treatment is correctly estimated as 0.05.

- -
-
-
- {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.05366850622769351}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Since this is a small, simulated, toy problem, we can manually calculate the adjusted treatment effect by controlling for the single confounder (i.e., post negativity):

- -
-
-
- {% raw %} - -
-
- -
-
-
from collections import defaultdict
-def ATE_adjusted(C, T, Y):
-    x = defaultdict(list)
-    for c, t, y in zip(C, T, Y):
-        x[c, t].append(y)
-
-    C0_ATE = np.mean(x[0,1]) - np.mean(x[0,0])
-    C1_ATE = np.mean(x[1,1]) - np.mean(x[1,0])
-    return np.mean([C0_ATE, C1_ATE])
-ATE_adjusted((df['negative']>0.5).astype('int'), df['Is_Male?'].values, df['Post_Shared?'].values)
-
- -
-
-
- -
-
- -
- - - -
-
0.0534529194528211
-
- -
- -
-
- -
- {% endraw %} - -
-
-

We see that this value is close to our estimate.

-

CausalNLP allows you to easily compute conditional or individualized treatment effects. -For instance, for negative posts, being male increases the chance of your post being shared by about 4 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['negative']>0.9)
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.042535751074149745}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

For positive posts, being male increases the chance of your post being shared by about 6 percentage points:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['negative']<0.1)
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.06436468274776497}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.05
-assert ate['ate'] < 0.055
-
- -
-
-
- -
- {% endraw %} - -
-
-

Predictions can be made for new observations. We just have to make sure it contains the relevant columns included in the DataFrame supplied to CausalInferenceModel.fit. In this case, it must include Is_Male? and negative. This can be verified with the CausalInferenceModel.get_required_columns method:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['Is_Male?', 'negative']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-     'text' : ['I love my life.'],
-    'Is_Male?' : [0],
-    'negative' : [0]
-      })
-effect = cm.predict(test_df)
-assert effect[0][0] < 0.065
-assert effect[0][0] > 0.064
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.06436468]]
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Causal Inference Using Raw Text as a Confounder/Mediator

In the example above, we approached the problem under the assumption that a specific linguistic property (sentiment) was an important mediator or confounder for which to control. In some cases, there may also be other unknown linguistic properties that are potential confounders/mediators (e.g., topic, politeness, toxic language, readability).

-

In CausalNLP, we can also use the raw text as the potential confounder/mediator.

- -
-
-
- {% raw %} - -
-
- -
-
-
cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='Is_Male?', outcome_col='Post_Shared?', text_col='Post_Text',
-                         ignore_cols=['negative', 'positive'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: Is_Male?
-numerical/categorical covariates: []
-text covariate: Post_Text
-preprocess time:  0.015369415283203125  sec
-start fitting causal inference model
-time to fit causal inference model:  0.5458502769470215  sec
-
-
-
- -
- - - -
-
<causalnlp.core.causalinference.CausalInferenceModel at 0x7fca0d1921d0>
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Although we have excluded the negative and positive columns as extra covariates, you can use traditional categorical/numerical covariates in combination with a text field covariate (if they exist as extra columns in the dataframe).

-

Here, we see that the same causal estimates are returned, as the text is easy to infer as positive or negative based on their correlations with the outcomes in this problem.

- -
-
-
- {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.05366850622769351}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.estimate_ate(df['Post_Text'] == 'My boss is pretty terrible.')
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.042535751074149745}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.estimate_ate(df['Post_Text'] == 'I really love my job!')
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.06436468274776497}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.05
-assert ate['ate'] < 0.055
-
- -
-
-
- -
- {% endraw %} - -
-
- -
Make predictions on new data.  Again, make sure the DataFrame contains the relevant columns included in the original DataFrame supplied to [`CausalInferenceModel.fit`](/causalnlp/core.causalinference.html#CausalInferenceModel.fit):
- -
-
-
- {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['Is_Male?', 'Post_Text']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-     'Post_Text' : ['I love my life.'],
-    'New Column' : [1],
-    'Is_Male?' : [0],
-    'negative' : [0]
-      })
-effect = cm.predict(test_df)
-assert effect[0][0] < 0.065
-assert effect[0][0] > 0.064
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.06436468]]
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=False, method='feature_importance')
-
- -
-
-
- -
-
- -
- - - -
-
{1: v_boss        1.0
- v_terrible    0.0
- v_pretty      0.0
- dtype: float64}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=True, method='feature_importance')
-
- -
-
-
- -
-
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.interpret(plot=True, method='shap_values')
-
- -
-
-
- -
-
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Causal Inference With Text as a Treatment

-
-
-
-
-
-

Suppose we were interested in estimating the causal impact of sentiment on the outcome. That is, sentiment of text is the treatment, and the gender is a potential confounder. As we did above, we can use the Autocoder to create the treatment variable. The only difference is that we would supply the binarize=True as an argument.

- -
-
-
- {% raw %} - -
-
- -
-
-
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=True, batch_size=16)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Is_Male?Post_TextPost_Shared?negativepositive
00I really love my job!001
10I really love my job!001
20I really love my job!001
30I really love my job!001
40I really love my job!001
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm = CausalInferenceModel(df, method='t-learner',
-                          treatment_col='positive', outcome_col='Post_Shared?',
-                          include_cols=['Is_Male?'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Post_Shared?
-treatment column: positive
-numerical/categorical covariates: ['Is_Male?']
-preprocess time:  0.008125543594360352  sec
-start fitting causal inference model
-time to fit causal inference model:  0.5112130641937256  sec
-
-
-
- -
- - - -
-
<causalnlp.core.causalinference.CausalInferenceModel at 0x7fca0ca17f98>
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
ate = cm.estimate_ate()
-ate
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.19008080596986368}
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert ate['ate'] > 0.18
-assert ate['ate'] < 0.2
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['positive', 'Is_Male?']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-    'Is_Male?' : [1],
-    'positive' : [1]
-      })
-effect = cm.predict(test_df)
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.20099539]]
-
-
-
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/examples.html b/docs/examples.html deleted file mode 100644 index af0a1bf..0000000 --- a/docs/examples.html +++ /dev/null @@ -1,1250 +0,0 @@ ---- - -title: Examples - - -keywords: fastai -sidebar: home_sidebar - -summary: "Various examples of CausalNLP on semi-simulated or real datasets." -description: "Various examples of CausalNLP on semi-simulated or real datasets." -nb_path: "nbs/99_examples.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
from causalnlp import CausalInferenceModel
-from causalnlp import Autocoder
-
- -
-
-
- -
- {% endraw %} - -
-
-

What is the causal impact of a positive review on product views?

We use a semi-simulated dataset generated from this repo, which is available in the sample_data folder. The reviews and product types are real, while the outcomes (e.g., 1=product clicked, 0=not clicked) are simulated.

- -
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', error_bad_lines=False)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
indexidratingproducttextsummarypriceT_trueC_trueY_simnegativepositiveT_ac
0700013887031.0mp3 musicbuy the cd. do not buy the mp3 album. downlo...Buy the CD. Do not buy the MP3.13.010000.5487330.4512670
1800013887035.0mp3 musictakes me back to my childhood!Love it!13.011000.0083730.9916271
21200013887035.0audio cdthe passion and ingenuity of green's music is ...No one like Keith Green13.011110.0437610.9562391
31300013887035.0mp3 musickeith's music is a timeless message. since hi...Never Gets Old13.011010.0388760.9611241
41500013776475.0audio cdi have fallen in love with john michael talbot...Talbot a masterpiece18.991110.0198280.9801721
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Y_sim is the simulated outcome indicating whether or not the product was clicked. C_true is a categorical variable, where 1 is an audio CD and and 0 is something else (e.g., MP3). In this dataset, outcomes were simulated such that C_true is a counfounding variable for this problem.

- -
-
-
-
-
-

The treatment is whether or not the review is positive, which affects Y_sim. Let's pretend we don't have a rating and need to infer this from text using the Autocoder. This can be done with:

-
ac = Autocoder()
-df = ac.code_sentiment(df['text'].values, df, batch_size=16, binarize=True)
-df['T_ac'] = df['positive']
-
-

We've already created this as the T_ac column (along with the positive and negative columns), so invoking the above is not needed. Note that T_ac is an imperfect approximation of T_true. In CausalNLP, we can include the raw text as covariates to improve our estimates.

-

Let's fit the causal inference model. We will adjust for both C_true and the raw text of the review to minimize bias from confounding. CausalNLP supports the following metalearners: S-Learner, T-Learner, X-Learner, and R-Learner. See this paper for more information on these. We will use the T-Learner as the metalearner here. By default, T-Learners use LightGBM classifiers with 31 leaves. Let's increase the number of leaves to 500. In practice, you can supply a learner with hyperparameters that you've tuned beforehand to accurately predict the outcome.

- -
-
-
- {% raw %} - -
-
- -
-
-
from lightgbm import LGBMClassifier
-from sklearn.linear_model import LogisticRegression, LinearRegression
-cm = CausalInferenceModel(df, method='t-learner',
-                    learner=LGBMClassifier(num_leaves=500),
-                    treatment_col='T_ac', 
-                    outcome_col='Y_sim', 
-                    text_col='text',
-                    include_cols=['C_true'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Y_sim
-treatment column: T_ac
-numerical/categorical covariates: ['C_true']
-text covariate: text
-preprocess time:  1.118110179901123  sec
-start fitting causal inference model
-time to fit causal inference model:  10.667636632919312  sec
-
-
-
- -
- - - -
-
<causalnlp.causalinference.CausalInferenceModel at 0x7f079361b0f0>
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Average Treatment Effect (ATE)

We can calculate the overall average treatment effect (ATE) as follows:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate()
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.1309311542209525}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

The overall ATE is an increase of 13 percentage points in probability.

-

Unlike machine learning, there is no ground truth to which our estimate can be compared for causal inference on real-world datasets. However, since this is a simulated dataset, we can compare our estimate with the ground truth ATE of 0.1479 (14.79 percentage point change in outcome), and our estimate is close.

- -
-
-
- {% raw %} - -
-
- -
-
-
from collections import defaultdict
-import numpy as np
-def ATE_adjusted(C, T, Y):
-    x = defaultdict(list)
-    for c, t, y in zip(C, T, Y):
-        x[c, t].append(y)
-
-    C0_ATE = np.mean(x[0,1]) - np.mean(x[0,0])
-    C1_ATE =  np.mean(x[1,1]) - np.mean(x[1,0])
-    return np.mean([C0_ATE, C1_ATE])
-print(ATE_adjusted(df.C_true, df.T_true, df.Y_sim))
-
- -
-
-
- -
-
- -
- -
-
0.14785542719890196
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Such oracle estimates are not available for real-world datasets, as mentioned. For real-world scenarios, we can, at least, evaluate the robustness of the ATE estimate to various data manipulations (i.e., sensitivity analysis or refutation).

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.evaluate_robustness()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MethodATENew ATENew ATE LBNew ATE UBDistance from Desired (should be near 0)
0Placebo Treatment0.1309310.00477642-0.004527050.01407990.00477642
0Random Cause0.1309310.1311220.1221960.1400490.000191267
0Subset Data(sample size @0.8)0.1309310.1293830.1172390.141528-0.0015477
0Random Replace0.1309310.1301960.1212090.139184-0.000734766
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Here, we see the distance from the desired value is near zero for each sensitivity analysis method, which is good.

- -
-
-
-
-
-

Conditional Average Treatment Effect (CATE)

We can also calculate the conditional average treatment effects (CATE). For instance, here is the treatment effect for those reviews that mention the word ``toddler.''

- -
-
-
- {% raw %} - -
-
- -
-
-
series = df['text']
-cm.estimate_ate(df['text'].str.contains('toddler'))
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.15559234254638685}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Individualized Treatment Effect (ITE)

We can easily predict the treatment effect for new or existing observations on a per-unit basis. We just need to make sure the DataFrame supplied as input to CausalInferenceModel.predict contains the right columns. This can easily be checked with CausalInferenceModel.get_required_columns:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.get_required_columns()
-
- -
-
-
- -
-
- -
- - - -
-
['T_ac', 'C_true', 'text']
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-    'T_ac' : [1],
-    'C_true' : [1],
-    'text' : ['I love the music of Zamfir and his pan flute.']
-      })
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.predict(test_df)
-
- -
-
-
- -
-
- -
- - - -
-
array([[0.40062776]])
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Model Interpretability

We can use the interpret method to identify the attributes most predictive of individualized treatment effects across observations. Features beginning with v_ are word (or vocabulary) features. We see that words like "music", "cd", and "love" in addition to the categorical attribute C_true (the known confounder which is 1 for audio CDs) are most predictive of individualized causal effects.

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.interpret(plot=False, method='feature_importance')[1][:10]
-
- -
-
-
- -
-
- -
- - - -
-
v_music    0.079042
-v_cd       0.066838
-v_album    0.055168
-v_like     0.040784
-v_love     0.040635
-C_true     0.039949
-v_just     0.035671
-v_song     0.035362
-v_great    0.029918
-v_heard    0.028373
-dtype: float64
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
cm.explain(test_df, row_num=0)
-
- -
-
-
- -
-
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

What is the causal impact of having a PhD on making over $50K?

Text is Optional in CausalNLP

-
-

Despite the "NLP" in the name, CausalNLP can be used for causal analyses on traditional tabular datasets with no text fields.

-

Note: This dataset is from the early to mid 1990s, and we are using it as a toy dataset for demonstration purposes only.

- -
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/adult-census.csv')
-df = df.rename(columns=lambda x: x.strip())
-df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x) 
-filter_set = 'Doctorate'
-df['treatment'] = df['education'].apply(lambda x: 1 if x in filter_set else 0)
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ageworkclassfnlwgteducationeducation-nummarital-statusoccupationrelationshipracesexcapital-gaincapital-losshours-per-weeknative-countryclasstreatment
025Private178478Bachelors13Never-marriedTech-supportOwn-childWhiteFemale0040United-States<=50K0
123State-gov617435th-6th3Never-marriedTransport-movingNot-in-familyWhiteMale0035United-States<=50K0
246Private376789HS-grad9Never-marriedOther-serviceNot-in-familyWhiteMale0015United-States<=50K0
355?200235HS-grad9Married-civ-spouse?HusbandWhiteMale0050United-States>50K0
436Private2245417th-8th4Married-civ-spouseHandlers-cleanersHusbandWhiteMale0040El-Salvador<=50K0
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
from causalnlp import CausalInferenceModel
-cm = CausalInferenceModel(df, method='t-learner',
-                   treatment_col='treatment', 
-                   outcome_col='class',
-                   ignore_cols=['fnlwgt', 'education','education-num']).fit()
-
- -
-
-
- -
-
- -
- -
-
replaced ['<=50K', '>50K'] in column "class" with [0, 1]
-outcome column (categorical): class
-treatment column: treatment
-numerical/categorical covariates: ['age', 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
-preprocess time:  0.4857158660888672  sec
-start fitting causal inference model
-time to fit causal inference model:  5.035430908203125  sec
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Overall, the average treatment effect of having a PhD is an increase of 20 percentage points in the probability of making over $50K (with respect to this model and dataset):

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate()
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.20340645077516034}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

For those who have a Master's degree:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['education'] == 'Masters')
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.17672418257642838}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

For those who are high school dropouts:

- -
-
-
- {% raw %} - -
-
- -
-
-
cm.estimate_ate(cm.df['education'].isin(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '12th']))
-
- -
-
-
- -
-
- -
- - - -
-
{'ate': 0.2586697863578173}
-
- -
- -
-
- -
- {% endraw %} - -
-
-

What is the causal impact of a job training program on earnings?

This is another example of causal inference on purely tabular data (no text). Here, we will use the famous LaLonde dataset from a job training study.

- -
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/lalonde.csv')
-df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
IDtreatageeducblackhispanmarriednodegreere74re75re78
0NSW11371110110.00.09930.0460
1NSW2122901010.00.03595.8940
2NSW31301210000.00.024909.4500
3NSW41271110010.00.07506.1460
4NSW5133810010.00.0289.7899
-
- -
- -
-
- -
- {% endraw %} - -
-
-

Unlike other meta-learners that use LightGBM as a default, the S-Learner uses Linear Regression as the default base learner for regression problems, which is a model that is often used for this dataset. The ATE estimate is $1548, which indicates that the job training program had an overall positive effect.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp import CausalInferenceModel
-cm = CausalInferenceModel(df, method='s-learner',
-                   treatment_col='treat', 
-                   outcome_col='re78',
-                   include_cols=['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75'])
-cm.fit()
-print(cm.estimate_ate()) # ATE estimate = $1548
-
- -
-
-
- -
-
- -
- -
-
outcome column (numerical): re78
-treatment column: treat
-numerical/categorical covariates: ['age', 'educ', 'black', 'hispan', 'married', 'nodegree', 're74', 're75']
-preprocess time:  0.017691612243652344  sec
-start fitting causal inference model
-time to fit causal inference model:  0.0024728775024414062  sec
-{'ate': 1548.2438019996084}
-
-
-
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/feed.xml b/docs/feed.xml deleted file mode 100644 index d8d6ac9..0000000 --- a/docs/feed.xml +++ /dev/null @@ -1,32 +0,0 @@ ---- -search: exclude -layout: none ---- - - - - - {{ site.title | xml_escape }} - {{ site.description | xml_escape }} - {{ site.url }}/ - - {{ site.time | date_to_rfc822 }} - {{ site.time | date_to_rfc822 }} - Jekyll v{{ jekyll.version }} - {% for post in site.posts limit:10 %} - - {{ post.title | xml_escape }} - {{ post.content | xml_escape }} - {{ post.date | date_to_rfc822 }} - {{ post.url | prepend: site.url }} - {{ post.url | prepend: site.url }} - {% for tag in post.tags %} - {{ tag | xml_escape }} - {% endfor %} - {% for tag in page.tags %} - {{ cat | xml_escape }} - {% endfor %} - - {% endfor %} - - diff --git a/docs/images/output_14_0.png b/docs/images/output_14_0.png deleted file mode 100644 index 0dc4c15..0000000 Binary files a/docs/images/output_14_0.png and /dev/null differ diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index 1a0bf78..0000000 --- a/docs/index.html +++ /dev/null @@ -1,371 +0,0 @@ ---- - -title: Welcome to CausalNLP - - -keywords: fastai -sidebar: home_sidebar - - - -nb_path: "nbs/index.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - -
-
-

What is CausalNLP?

CausalNLP is a practical toolkit for causal inference with text as treatment, outcome, or "controlled-for" variable.

-
- -
-
-
-
-
-

Features

- -
-
-
-
-
-

Install

-
-
-
-
-
-
    -
  1. pip install -U pip
  2. -
  3. pip install causalnlp
  4. -
-

NOTE: On Python 3.6.x, if you get a RuntimeError: Python version >= 3.7 required, try ensuring NumPy is installed before CausalNLP (e.g., pip install numpy==1.18.5).

- -
-
-
-
-
-

Usage

-
-
-
-
-
-

To try out the examples yourself:

-

Open In Colab

- -
-
-
-
-
-

Example: What is the causal impact of a positive review on a product click?

-
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', error_bad_lines=False)
-
- -
-
-
- -
- {% endraw %} - -
-
-

The file music_seed50.tsv is a semi-simulated dataset from here. Columns of relevance include:

-
    -
  • Y_sim: outcome, where 1 means product was clicked and 0 means not.
  • -
  • text: raw text of review
  • -
  • rating: rating associated with review (1 through 5)
  • -
  • T_true: 0 means rating less than 3, 1 means rating of 5, where T_true affects the outcome Y_sim.
  • -
  • T_ac: an approximation of true review sentiment (T_true) created with Autocoder from raw review text
  • -
  • C_true:confounding categorical variable (1=audio CD, 0=other)
  • -
-

We'll pretend the true sentiment (i.e., review rating and T_true) is hidden and only use T_ac as the treatment variable.

-

Using the text_col parameter, we include the raw review text as another "controlled-for" variable.

- -
-
-
- {% raw %} - -
-
- -
-
-
from causalnlp import CausalInferenceModel
-from lightgbm import LGBMClassifier
-cm = CausalInferenceModel(df, 
-                         metalearner_type='t-learner', learner=LGBMClassifier(num_leaves=500),
-                         treatment_col='T_ac', outcome_col='Y_sim', text_col='text',
-                         include_cols=['C_true'])
-cm.fit()
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Y_sim
-treatment column: T_ac
-numerical/categorical covariates: ['C_true']
-text covariate: text
-preprocess time:  1.1179866790771484  sec
-start fitting causal inference model
-time to fit causal inference model:  10.361494302749634  sec
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Estimating Treatment Effects

CausalNLP supports estimation of heterogeneous treatment effects (i.e., how causal impacts vary across observations, which could be documents, emails, posts, individuals, or organizations).

-

We will first calculate the overall average treatment effect (or ATE), which shows that a positive review increases the probability of a click by 13 percentage points in this dataset.

-

Average Treatment Effect (or ATE):

- -
-
-
- {% raw %} - -
-
- -
-
-
print( cm.estimate_ate() )
-
- -
-
-
- -
-
- -
- -
-
{'ate': 0.1309311542209525}
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Conditional Average Treatment Effect (or CATE): reviews that mention the word "toddler":

- -
-
-
- {% raw %} - -
-
- -
-
-
print( cm.estimate_ate(df['text'].str.contains('toddler')) )
-
- -
-
-
- -
-
- -
- -
-
{'ate': 0.15559234254638685}
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Individualized Treatment Effects (or ITE):

- -
-
-
- {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({'T_ac' : [1], 'C_true' : [1], 
-                        'text' : ['I never bought this album, but I love his music and will soon!']})
-effect = cm.predict(test_df)
-print(effect)
-
- -
-
-
- -
-
- -
- -
-
[[0.80538201]]
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Model Interpretability:

- -
-
-
- {% raw %} - -
-
- -
-
-
print( cm.interpret(plot=False)[1][:10] )
-
- -
-
-
- -
-
- -
- -
-
v_music    0.079042
-v_cd       0.066838
-v_album    0.055168
-v_like     0.040784
-v_love     0.040635
-C_true     0.039949
-v_just     0.035671
-v_song     0.035362
-v_great    0.029918
-v_heard    0.028373
-dtype: float64
-
-
-
- -
-
- -
- {% endraw %} - -
-
-

Features with the v_ prefix are word features. C_true is the categorical variable indicating whether or not the product is a CD.

-

Text is Optional in CausalNLP

Despite the "NLP" in CausalNLP, the library can be used for causal inference on data without text (e.g., only numerical and categorical variables). See the examples for more info.

- -
-
-
-
-
-

Documentation

API documentation and additional usage examples are available at: https://amaiya.github.io/causalnlp/

-

How to Cite

Please cite the following paper when using CausalNLP in your work:

- -
-
-
-
-
- -
@article{maiya2021causalnlp,
-    title={CausalNLP: A Practical Toolkit for Causal Inference with Text},
-    author={Arun S. Maiya},
-    year={2021},
-    eprint={2106.08043},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL},
-    journal={arXiv preprint arXiv:2106.08043},
-}
- -
-
-
-
- - diff --git a/docs/key_driver_analysis.html b/docs/key_driver_analysis.html deleted file mode 100644 index dc1f4b1..0000000 --- a/docs/key_driver_analysis.html +++ /dev/null @@ -1,487 +0,0 @@ ---- - -title: Key Driver Analysis - - -keywords: fastai -sidebar: home_sidebar - -summary: "Key driver analysis to yield clues into **potential** causal relationships in your data by determining variables with high predictive power, high correlation with outcome, etc." -description: "Key driver analysis to yield clues into **potential** causal relationships in your data by determining variables with high predictive power, high correlation with outcome, etc." -nb_path: "nbs/03_key_driver_analysis.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class KeyDriverAnalysis[source]

KeyDriverAnalysis(df, outcome_col='outcome', text_col=None, include_cols=[], ignore_cols=[], verbose=1)

-
-

Performs key driver analysis

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

KeyDriverAnalysis.correlations[source]

KeyDriverAnalysis.correlations(outcome_only=True)

-
-

Computes corelations between independent variables and outcome

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/houses.csv')
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
from causalnlp.key_driver_analysis import KeyDriverAnalysis
-kda = KeyDriverAnalysis(df, outcome_col='SalePrice', ignore_cols=['Id', 'YearSold'])
-
- -
-
-
- -
-
- -
- -
-
outcome column (numerical): SalePrice
-treatment column: CausalNLP_temp_treatment
-numerical/categorical covariates: ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
-preprocess time:  0.3556947708129883  sec
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df_results = kda.correlations()
-df_results.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SalePrice
OverallQual0.790982
GrLivArea0.708624
GarageCars0.640409
GarageArea0.623431
TotalBsmtSF0.613581
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert df_results.iloc[[0]].index.values[0] == 'OverallQual'
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

KeyDriverAnalysis.importances[source]

KeyDriverAnalysis.importances(plot=True, split_pct=0.2, use_shap=False, shap_background_size=50, rf_model=None, n_estimators=100, n_jobs=-1, random_state=42)

-
-

Identifies important predictors using a RandomForest model.

- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Example: Variable Importances for Housing Prices

-
-
-
- {% raw %} - -
-
- -
-
-
df_results = kda.importances()
-df_results.head()
-
- -
-
-
- -
-
- -
- -
-
R^2 Training Score: 0.98 
-OOB Score: 0.85 
-R^2 Validation Score: 0.89
-
-
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DriverImportance
3OverallQual0.557707
15GrLivArea0.121145
11TotalBsmtSF0.035977
132ndFlrSF0.033758
8BsmtFinSF10.028563
-
- -
- -
- - - -
- -
- -
- -
-
- -
- {% endraw %} - -
-
-

Example: Variable Importances for Probability of Making Over $50K

-
-
-
- {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/adult-census.csv')
-kda = KeyDriverAnalysis(df, outcome_col='class', ignore_cols=['fnlwgt'])
-df_results = kda.importances(use_shap=True, plot=True)
-df_results.head()
-
- -
-
-
- -
-
- -
- -
-
replaced ['<=50K', '>50K'] in column "class" with [0, 1]
-outcome column (categorical): class
-treatment column: CausalNLP_temp_treatment
-numerical/categorical covariates: ['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
-preprocess time:  0.5094420909881592  sec
-R^2 Training Score: 0.98 
-OOB Score: 0.85 
-R^2 Validation Score: 0.85
-
-
-
- -
- - - -
- -
- -
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DriverImportance
2capital-gain0.102854
0age0.036508
1education-num0.035481
32marital-status_Married-civ-spouse0.031246
52relationship_Husband0.028451
-
- -
- -
-
- -
- {% endraw %} - -
- - diff --git a/docs/meta.base.html b/docs/meta.base.html deleted file mode 100644 index 492ab00..0000000 --- a/docs/meta.base.html +++ /dev/null @@ -1,65 +0,0 @@ ---- - -title: Base Metalearner - - -keywords: fastai -sidebar: home_sidebar - -summary: "Metalearner Base" -description: "Metalearner Base" -nb_path: "nbs/05a_meta.base.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseLearner[source]

BaseLearner()

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.explainer.html b/docs/meta.explainer.html deleted file mode 100644 index 4fa9513..0000000 --- a/docs/meta.explainer.html +++ /dev/null @@ -1,65 +0,0 @@ ---- - -title: Metalearner Explainer - - -keywords: fastai -sidebar: home_sidebar - -summary: "Metalearner Expainer" -description: "Metalearner Expainer" -nb_path: "nbs/05g_meta.explainer.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class Explainer[source]

Explainer(method, control_name, X, tau, classes, model_tau=None, features=None, normalize=True, test_size=0.3, random_state=None, override_checks=False, r_learners=None)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.propensity.html b/docs/meta.propensity.html deleted file mode 100644 index 45b7598..0000000 --- a/docs/meta.propensity.html +++ /dev/null @@ -1,236 +0,0 @@ ---- - -title: Metalearner Propensity - - -keywords: fastai -sidebar: home_sidebar - -summary: "Metalearner Propensity" -description: "Metalearner Propensity" -nb_path: "nbs/05h_meta.propensity.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class PropensityModel[source]

PropensityModel(clip_bounds=(0.001, 0.999), **model_kwargs)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class LogisticRegressionPropensityModel[source]

LogisticRegressionPropensityModel(clip_bounds=(0.001, 0.999), **model_kwargs) :: PropensityModel

-
-

Propensity regression model based on the LogisticRegression algorithm.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SimplePropensityModel[source]

SimplePropensityModel(clip_bounds=(0.001, 0.999), **model_kwargs) :: PropensityModel

-
-

Propensity regression model based on the LogisticRegression algorithm.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class ElasticNetPropensityModel[source]

ElasticNetPropensityModel(clip_bounds=(0.001, 0.999), **model_kwargs) :: LogisticRegressionPropensityModel

-
-

Propensity regression model based on the LogisticRegression algorithm.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class GradientBoostedPropensityModel[source]

GradientBoostedPropensityModel(early_stop=False, clip_bounds=(0.001, 0.999), **model_kwargs) :: PropensityModel

-
-

Gradient boosted propensity score model with optional early stopping.

-

Notes

Please see the xgboost documentation for more information on gradient boosting tuning parameters: -https://xgboost.readthedocs.io/en/latest/python/python_api.html

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

calibrate[source]

calibrate(ps, treatment)

-
-

Calibrate propensity scores with logistic GAM.

-

Ref: https://pygam.readthedocs.io/en/latest/api/logisticgam.html

-

Args: - ps (numpy.array): a propensity score vector - treatment (numpy.array): a binary treatment vector (0: control, 1: treated)

-

Returns: - (numpy.array): a calibrated propensity score vector

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

compute_propensity_score[source]

compute_propensity_score(X, treatment, p_model=None, X_pred=None, treatment_pred=None, calibrate_p=True)

-
-

Generate propensity score if user didn't provide

-

Args: - X (np.matrix): features for training - treatment (np.array or pd.Series): a treatment vector for training - p_model (propensity model object, optional): - ElasticNetPropensityModel (default) / GradientBoostedPropensityModel - X_pred (np.matrix, optional): features for prediction - treatment_pred (np.array or pd.Series, optional): a treatment vector for prediciton - calibrate_p (bool, optional): whether calibrate the propensity score

-

Returns: - (tuple)

- -
    - p (numpy.ndarray): propensity score
-    - p_model (PropensityModel): a trained PropensityModel object
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.rlearner.html b/docs/meta.rlearner.html deleted file mode 100644 index 20b3b39..0000000 --- a/docs/meta.rlearner.html +++ /dev/null @@ -1,152 +0,0 @@ ---- - -title: R-Learner - - -keywords: fastai -sidebar: home_sidebar - -summary: "R-Learner" -description: "R-Learner" -nb_path: "nbs/05e_meta.rlearner.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseRLearner[source]

BaseRLearner(learner=None, outcome_learner=None, effect_learner=None, propensity_learner=LogisticRegressionCV(Cs=array([1.00230524, 2.15608891, 4.63802765, 9.97700064]), - cv=StratifiedKFold(n_splits=4, random_state=42, shuffle=True), - l1_ratios=array([0.001 , 0.33366667, 0.66633333, 0.999 ]), - penalty='elasticnet', random_state=42, solver='saga'), ate_alpha=0.05, control_name=0, n_fold=5, random_state=None) :: BaseLearner

-
-

A parent class for R-learner classes.

-

An R-learner estimates treatment effects with two machine learning models and the propensity score.

-

Details of R-learner are available at Nie and Wager (2019) (https://arxiv.org/abs/1712.04912).

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseRRegressor[source]

BaseRRegressor(learner=None, outcome_learner=None, effect_learner=None, propensity_learner=LogisticRegressionCV(Cs=array([1.00230524, 2.15608891, 4.63802765, 9.97700064]), - cv=StratifiedKFold(n_splits=4, random_state=42, shuffle=True), - l1_ratios=array([0.001 , 0.33366667, 0.66633333, 0.999 ]), - penalty='elasticnet', random_state=42, solver='saga'), ate_alpha=0.05, control_name=0, n_fold=5, random_state=None) :: BaseRLearner

-
-

A parent class for R-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseRClassifier[source]

BaseRClassifier(outcome_learner=None, effect_learner=None, propensity_learner=LogisticRegressionCV(Cs=array([1.00230524, 2.15608891, 4.63802765, 9.97700064]), - cv=StratifiedKFold(n_splits=4, random_state=42, shuffle=True), - l1_ratios=array([0.001 , 0.33366667, 0.66633333, 0.999 ]), - penalty='elasticnet', random_state=42, solver='saga'), ate_alpha=0.05, control_name=0, n_fold=5, random_state=None) :: BaseRLearner

-
-

A parent class for R-learner classifier classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class XGBRRegressor[source]

XGBRRegressor(early_stopping=True, test_size=0.3, early_stopping_rounds=30, effect_learner_objective='rank:pairwise', effect_learner_n_estimators=500, random_state=42, *args, **kwargs) :: BaseRRegressor

-
-

A parent class for R-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.sensitivity.html b/docs/meta.sensitivity.html deleted file mode 100644 index f6122b4..0000000 --- a/docs/meta.sensitivity.html +++ /dev/null @@ -1,326 +0,0 @@ ---- - -title: Metalearner Sensitivity - - -keywords: fastai -sidebar: home_sidebar - -summary: "Metalearner Sensitivity" -description: "Metalearner Sensitivity" -nb_path: "nbs/05i_meta.sensitivity.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

one_sided[source]

one_sided(alpha, p, treatment)

-
-

One sided confounding function. -Reference: Blackwell, Matthew. "A selection bias approach to sensitivity analysis -for causal effects." Political Analysis 22.2 (2014): 169-182. -https://www.mattblackwell.org/files/papers/causalsens.pdf

-

Args: - alpha (np.array): a confounding values vector - p (np.array): a propensity score vector between 0 and 1 - treatment (np.array): a treatment vector (1 if treated, otherwise 0)

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

alignment[source]

alignment(alpha, p, treatment)

-
-

Alignment confounding function. -Reference: Blackwell, Matthew. "A selection bias approach to sensitivity analysis -for causal effects." Political Analysis 22.2 (2014): 169-182. -https://www.mattblackwell.org/files/papers/causalsens.pdf

-

Args: - alpha (np.array): a confounding values vector - p (np.array): a propensity score vector between 0 and 1 - treatment (np.array): a treatment vector (1 if treated, otherwise 0)

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

one_sided_att[source]

one_sided_att(alpha, p, treatment)

-
-

One sided confounding function for the average effect of the treatment among the treated units (ATT)

-

Reference: Blackwell, Matthew. "A selection bias approach to sensitivity analysis -for causal effects." Political Analysis 22.2 (2014): 169-182. -https://www.mattblackwell.org/files/papers/causalsens.pdf

-

Args: - alpha (np.array): a confounding values vector - p (np.array): a propensity score vector between 0 and 1 - treatment (np.array): a treatment vector (1 if treated, otherwise 0)

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

alignment_att[source]

alignment_att(alpha, p, treatment)

-
-

Alignment confounding function for the average effect of the treatment among the treated units (ATT)

-

Reference: Blackwell, Matthew. "A selection bias approach to sensitivity analysis -for causal effects." Political Analysis 22.2 (2014): 169-182. -https://www.mattblackwell.org/files/papers/causalsens.pdf

-

Args: - alpha (np.array): a confounding values vector - p (np.array): a propensity score vector between 0 and 1 - treatment (np.array): a treatment vector (1 if treated, otherwise 0)

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class Sensitivity[source]

Sensitivity(df, inference_features, p_col, treatment_col, outcome_col, learner, *args, **kwargs)

-
-

A Sensitivity Check class to support Placebo Treatment, Irrelevant Additional Confounder -and Subset validation refutation methods to verify causal inference.

-

Reference: https://github.com/microsoft/dowhy/blob/master/dowhy/causal_refuters/

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SensitivityPlaceboTreatment[source]

SensitivityPlaceboTreatment(*args, **kwargs) :: Sensitivity

-
-

Replaces the treatment variable with a new variable randomly generated.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SensitivityRandomCause[source]

SensitivityRandomCause(*args, **kwargs) :: Sensitivity

-
-

Adds an irrelevant random covariate to the dataframe.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SensitivityRandomReplace[source]

SensitivityRandomReplace(*args, **kwargs) :: Sensitivity

-
-

Replaces a random covariate with an irrelevant variable.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SensitivitySubsetData[source]

SensitivitySubsetData(*args, **kwargs) :: Sensitivity

-
-

Takes a random subset of size sample_size of the data.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class SensitivitySelectionBias[source]

SensitivitySelectionBias(*args, confound='one_sided', alpha_range=None, sensitivity_features=None, **kwargs) :: Sensitivity

-
-

Reference:

-

[1] Blackwell, Matthew. "A selection bias approach to sensitivity analysis -for causal effects." Political Analysis 22.2 (2014): 169-182. -https://www.mattblackwell.org/files/papers/causalsens.pdf

-

[2] Confouding parameter alpha_range using the same range as in: -https://github.com/mattblackwell/causalsens/blob/master/R/causalsens.R

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.slearner.html b/docs/meta.slearner.html deleted file mode 100644 index 74c44a8..0000000 --- a/docs/meta.slearner.html +++ /dev/null @@ -1,118 +0,0 @@ ---- - -title: S-Learner - - -keywords: fastai -sidebar: home_sidebar - -summary: "S-Learner" -description: "S-Learner" -nb_path: "nbs/05c_meta.slearner.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseSLearner[source]

BaseSLearner(learner=None, ate_alpha=0.05, control_name=0) :: BaseLearner

-
-

A parent class for S-learner classes. -An S-learner estimates treatment effects with one machine learning model. -Details of S-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseSRegressor[source]

BaseSRegressor(learner=None, ate_alpha=0.05, control_name=0) :: BaseSLearner

-
-

A parent class for S-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseSClassifier[source]

BaseSClassifier(learner=None, ate_alpha=0.05, control_name=0) :: BaseSLearner

-
-

A parent class for S-learner classifier classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.tleaerner.html b/docs/meta.tleaerner.html deleted file mode 100644 index 68247cf..0000000 --- a/docs/meta.tleaerner.html +++ /dev/null @@ -1,168 +0,0 @@ ---- - -title: T-Learner - - -keywords: fastai -sidebar: home_sidebar - -summary: "T-Learner" -description: "T-Learner" -nb_path: "nbs/05b_meta.tleaerner.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTLearner[source]

BaseTLearner(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseLearner

-
-

A parent class for T-learner regressor classes.

-

A T-learner estimates treatment effects with two machine learning models.

-

Details of T-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTRegressor[source]

BaseTRegressor(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseTLearner

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTClassifier[source]

BaseTClassifier(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseTLearner

-
-

A parent class for T-learner classifier classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class XGBTRegressor[source]

XGBTRegressor(ate_alpha=0.05, control_name=0, *args, **kwargs) :: BaseTRegressor

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class MLPTRegressor[source]

MLPTRegressor(ate_alpha=0.05, control_name=0, *args, **kwargs) :: BaseTRegressor

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.tlearner.html b/docs/meta.tlearner.html deleted file mode 100644 index 1314bed..0000000 --- a/docs/meta.tlearner.html +++ /dev/null @@ -1,168 +0,0 @@ ---- - -title: T-Learner - - -keywords: fastai -sidebar: home_sidebar - -summary: "T-Learner" -description: "T-Learner" -nb_path: "nbs/05b_meta.tlearner.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTLearner[source]

BaseTLearner(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseLearner

-
-

A parent class for T-learner regressor classes.

-

A T-learner estimates treatment effects with two machine learning models.

-

Details of T-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTRegressor[source]

BaseTRegressor(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseTLearner

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseTClassifier[source]

BaseTClassifier(learner=None, control_learner=None, treatment_learner=None, ate_alpha=0.05, control_name=0) :: BaseTLearner

-
-

A parent class for T-learner classifier classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class XGBTRegressor[source]

XGBTRegressor(ate_alpha=0.05, control_name=0, *args, **kwargs) :: BaseTRegressor

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class MLPTRegressor[source]

MLPTRegressor(ate_alpha=0.05, control_name=0, *args, **kwargs) :: BaseTRegressor

-
-

A parent class for T-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.utils.html b/docs/meta.utils.html deleted file mode 100644 index 6a8913c..0000000 --- a/docs/meta.utils.html +++ /dev/null @@ -1,591 +0,0 @@ ---- - -title: Metalearner Utils - - -keywords: fastai -sidebar: home_sidebar - -summary: "Metalearner Utils" -description: "Metalearner Utils" -nb_path: "nbs/05f_meta.utils.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

convert_pd_to_np[source]

convert_pd_to_np(*args)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

check_treatment_vector[source]

check_treatment_vector(treatment, control_name=None)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

check_p_conditions[source]

check_p_conditions(p, t_groups)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

check_explain_conditions[source]

check_explain_conditions(method, models, X=None, treatment=None, y=None)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

clean_xgboost_objective[source]

clean_xgboost_objective(objective)

-
-

Translate objective to be compatible with loaded xgboost version

-

Args

objective : string - The objective to translate.

-

Returns

The translated objective, or original if no translation was required.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

get_xgboost_objective_metric[source]

get_xgboost_objective_metric(objective)

-
-

Get the xgboost version-compatible objective and evaluation metric from a potentially version-incompatible input.

-

Args

objective : string - An xgboost objective that may be incompatible with the installed version.

-

Returns

A tuple with the translated objective and evaluation metric.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

ape[source]

ape(y, p)

-
-

Absolute Percentage Error (APE). -Args: - y (float): target - p (float): prediction

-

Returns: - e (float): APE

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

mape[source]

mape(y, p)

-
-

Mean Absolute Percentage Error (MAPE). -Args: - y (numpy.array): target - p (numpy.array): prediction

-

Returns: - e (numpy.float64): MAPE

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

smape[source]

smape(y, p)

-
-

Symmetric Mean Absolute Percentage Error (sMAPE). -Args: - y (numpy.array): target - p (numpy.array): prediction

-

Returns: - e (numpy.float64): sMAPE

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

rmse[source]

rmse(y, p)

-
-

Root Mean Squared Error (RMSE). -Args: - y (numpy.array): target - p (numpy.array): prediction

-

Returns: - e (numpy.float64): RMSE

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

gini[source]

gini(y, p)

-
-

Normalized Gini Coefficient.

-

Args: - y (numpy.array): target - p (numpy.array): prediction

-

Returns: - e (numpy.float64): normalized Gini coefficient

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

regression_metrics[source]

regression_metrics(y, p, w=None, metrics={'RMSE': <function rmse at 0x7f2b5b564a60>, 'sMAPE': <function smape at 0x7f2b5b5649d0>, 'Gini': <function gini at 0x7f2b5b564af0>})

-
-

Log metrics for regressors.

-

Args: - y (numpy.array): target - p (numpy.array): prediction - w (numpy.array, optional): a treatment vector (1 or True: treatment, 0 or False: control). If given, log - metrics for the treatment and control group separately - metrics (dict, optional): a dictionary of the metric names and functions

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

logloss[source]

logloss(y, p)

-
-

Bounded log loss error. -Args: - y (numpy.array): target - p (numpy.array): prediction -Returns: - bounded log loss error

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

classification_metrics[source]

classification_metrics(y, p, w=None, metrics={'AUC': <function roc_auc_score at 0x7f2b6c9455e0>, 'Log Loss': <function logloss at 0x7f2b5b564c10>})

-
-

Log metrics for classifiers.

-

Args: - y (numpy.array): target - p (numpy.array): prediction - w (numpy.array, optional): a treatment vector (1 or True: treatment, 0 or False: control). If given, log - metrics for the treatment and control group separately - metrics (dict, optional): a dictionary of the metric names and functions

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

smd[source]

smd(feature, treatment)

-
-

Calculate the standard mean difference (SMD) of a feature between the -treatment and control groups.

-

The definition is available at -https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3144483/#s11title

-

Args: - feature (pandas.Series): a column of a feature to calculate SMD for - treatment (pandas.Series): a column that indicate whether a row is in - the treatment group or not

-

Returns: - (float): The SMD of the feature

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

create_table_one[source]

create_table_one(data, treatment_col, features)

-
-

Report balance in input features between the treatment and control groups.

-

References: - R's tableone at CRAN: https://github.com/kaz-yos/tableone - Python's tableone at PyPi: https://github.com/tompollard/tableone

-

Args: - data (pandas.DataFrame): total or matched sample data - treatment_col (str): the column name for the treatment - features (list of str): the column names of features

-

Returns: - (pandas.DataFrame): A table with the means and standard deviations in - the treatment and control groups, and the SMD between two groups - for the features.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class NearestNeighborMatch[source]

NearestNeighborMatch(caliper=0.2, replace=False, ratio=1, shuffle=True, random_state=None)

-
-

Propensity score matching based on the nearest neighbor algorithm.

-

Attributes: - caliper (float): threshold to be considered as a match. - replace (bool): whether to match with replacement or not - ratio (int): ratio of control / treatment to be matched. used only if - replace=True. - shuffle (bool): whether to shuffle the treatment group data before - matching - random_state (numpy.random.RandomState or int): RandomState or an int - seed

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class MatchOptimizer[source]

MatchOptimizer(treatment_col='is_treatment', ps_col='pihat', user_col=None, matching_covariates=['pihat'], max_smd=0.1, max_deviation=0.1, caliper_range=(0.01, 0.5), max_pihat_range=(0.95, 0.999), max_iter_per_param=5, min_users_per_group=1000, smd_cols=['pihat'], dev_cols_transformations={'pihat': <function mean at 0x7f2ba076db80>}, dev_factor=1.0, verbose=True)

-
- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/meta.xlearner.html b/docs/meta.xlearner.html deleted file mode 100644 index d704b8c..0000000 --- a/docs/meta.xlearner.html +++ /dev/null @@ -1,118 +0,0 @@ ---- - -title: X-Learner - - -keywords: fastai -sidebar: home_sidebar - -summary: "X-Learner" -description: "X-Learner" -nb_path: "nbs/05d_meta.xlearner.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseXLearner[source]

BaseXLearner(learner=None, control_outcome_learner=None, treatment_outcome_learner=None, control_effect_learner=None, treatment_effect_learner=None, ate_alpha=0.05, control_name=0) :: BaseLearner

-
-

A parent class for X-learner regressor classes.

-

An X-learner estimates treatment effects with four machine learning models.

-

Details of X-learner are available at Kunzel et al. (2018) (https://arxiv.org/abs/1706.03461).

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseXRegressor[source]

BaseXRegressor(learner=None, control_outcome_learner=None, treatment_outcome_learner=None, control_effect_learner=None, treatment_effect_learner=None, ate_alpha=0.05, control_name=0) :: BaseXLearner

-
-

A parent class for X-learner regressor classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class BaseXClassifier[source]

BaseXClassifier(outcome_learner=None, effect_learner=None, control_outcome_learner=None, treatment_outcome_learner=None, control_effect_learner=None, treatment_effect_learner=None, ate_alpha=0.05, control_name=0) :: BaseXLearner

-
-

A parent class for X-learner classifier classes.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - -
- - diff --git a/docs/preprocessing.html b/docs/preprocessing.html deleted file mode 100644 index e385f4f..0000000 --- a/docs/preprocessing.html +++ /dev/null @@ -1,679 +0,0 @@ ---- - -title: Preprocessing - - -keywords: fastai -sidebar: home_sidebar - -summary: "Preprocesses dataset" -description: "Preprocesses dataset" -nb_path: "nbs/04_preprocessing.ipynb" ---- - - -
- - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

class DataframePreprocessor[source]

DataframePreprocessor(treatment_col='treatment', outcome_col='outcome', text_col=None, include_cols=[], ignore_cols=[], verbose=1)

-
-

Preproceses a pandas DataFrame for causal inference

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
- -
- {% endraw %} - - {% raw %} - -
- -
-
- -
- - -
-

DataframePreprocessor.preprocess[source]

DataframePreprocessor.preprocess(df, training=False, min_df=0.05, max_df=0.5, ngram_range=(1, 1), stop_words='english', na_cont_value=-1, na_cat_value='MISSING')

-
-

Preprocess a dataframe for causal inference.

- -
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
import pandas as pd
-df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', error_bad_lines=False)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
pp = DataframePreprocessor(treatment_col='T_ac', outcome_col='Y_sim', 
-                           text_col='text', include_cols=['C_true', 'product'])
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
df, X, Y, T = pp.preprocess(df, training=True)
-
- -
-
-
- -
-
- -
- -
-
outcome column (categorical): Y_sim
-treatment column: T_ac
-numerical/categorical covariates: ['product', 'C_true']
-text covariate: text
-preprocess time:  1.49556303024292  sec
-
-
-
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
X.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
C_trueproduct_audio cdproduct_mp3 musicproduct_vinylv_albumv_albumsv_bandv_beautifulv_bestv_betterv_boughtv_buyv_cdv_collectionv_didv_donv_excellentv_fanv_favoritev_goodv_gotv_greatv_hearv_heardv_justv_knowv_likev_listenv_listeningv_lovev_musicv_newv_oldv_originalv_reallyv_recordv_recordingv_rockv_songv_songsv_soundv_soundsv_thinkv_timev_trackv_tracksv_vev_voicev_wayv_workv_years
000100.252320.00.00.00.00.00.00.8507980.2516790.00.00.3861810.00.00.00.00.00.00.00.0000000.00.0000000.0000000.00.00.0000000.0000000.00.00.000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
100100.000000.00.00.00.00.00.00.0000000.0000000.00.00.0000000.00.00.00.00.00.00.00.0000000.00.0000000.0000000.00.00.0000000.0000000.00.00.000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
211000.000000.00.00.00.00.00.00.0000000.5422500.00.00.0000000.00.00.00.00.00.00.00.0000000.00.0000000.0000000.00.00.6251380.5613980.00.00.000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
300100.000000.00.00.00.00.00.00.0000000.0000000.00.00.0000000.00.00.00.00.00.00.00.0000000.00.6291060.0000000.00.00.0000000.7773190.00.00.000000.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
411000.000000.00.00.00.00.00.00.0000000.0000000.00.00.0000000.00.00.00.00.00.00.00.5277510.00.0000000.3925720.00.00.3729820.3349520.00.00.562190.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-    'C_true' : [0, 1],
-    'product': ['vinyl', 'mp3 music'],
-     'text' : ['This record hurts my ears.', "The music of Yanni is beautiful and breath-taking."],
-    'Y_sim' : [0, 1],
-     'T_ac' : [0, 1],
-      })
-test_df.head()
-
- -
-
-
- -
-
- -
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
C_trueproducttextY_simT_ac
00vinylThis record hurts my ears.00
11mp3 musicThe music of Yanni is beautiful and breath-tak...11
-
- -
- -
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
_, X_test, _, _ = pp.preprocess(test_df, training=False)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
assert sum([X_test.columns.values[i] == col for i,col in enumerate(X.columns.values)]) == len(X.columns.values)
-
- -
-
-
- -
- {% endraw %} - - {% raw %} - -
-
- -
-
-
test_df = pd.DataFrame({
-    'product': ['vinyl', 'mp3 music'],
-     'text' : ['This record hurts my ears.', "The music of Yanni is beautiful and breath-taking."],
-    'Y_sim' : [0, 1],
-     'T_ac' : [0, 1],
-      })
-error = False
-try: 
-    _, X_test, _, _ = pp.preprocess(test_df, training=False)
-except ValueError:
-    error = True
-assert error is True
-
- -
-
-
- -
- {% endraw %} - -
- - diff --git a/docs/sidebar.json b/docs/sidebar.json deleted file mode 100644 index f407ae6..0000000 --- a/docs/sidebar.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "causalnlp": { - "Overview": "/", - "Causal Inference": "core.causalinference.html", - "CausalBert": "core.causalbert.html", - "Auto Coder": "autocoder.html", - "Analyzers": "analyzers.html", - "Key Driver Analysis": "key_driver_analysis.html", - "Preprocessing": "preprocessing.html", - "Base Metalearner": "meta.base.html", - "T-Learner": "meta.tlearner.html", - "S-Learner": "meta.slearner.html", - "X-Learner": "meta.xlearner.html", - "R-Learner": "meta.rlearner.html", - "Metalearner Utils": "meta.utils.html", - "Metalearner Explainer": "meta.explainer.html", - "Metalearner Propensity": "meta.propensity.html", - "Metalearner Sensitivity": "meta.sensitivity.html", - "Examples": "examples.html" - } -} \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml deleted file mode 100644 index 38a04d6..0000000 --- a/docs/sitemap.xml +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: none -search: exclude ---- - - - - {% for post in site.posts %} - {% unless post.search == "exclude" %} - - {{site.url}}{{post.url}} - - {% endunless %} - {% endfor %} - - - {% for page in site.pages %} - {% unless page.search == "exclude" %} - - {{site.url}}{{ page.url}} - - {% endunless %} - {% endfor %} - \ No newline at end of file diff --git a/nbs/00a_core.causalinference.ipynb b/nbs/00a_core.causalinference.ipynb index 5bfc037..77d36e7 100644 --- a/nbs/00a_core.causalinference.ipynb +++ b/nbs/00a_core.causalinference.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Causal Inference API\n", + "output-file: core.causalinference.html\n", + "title: Causal Inference\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp core.causalinference" + "#| 
default_exp core.causalinference" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Causal Inference\n", - "\n", - "> Causal Inference API" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", "import time\n", @@ -1786,8 +1790,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/00b_core.causalbert.ipynb b/nbs/00b_core.causalbert.ipynb index 442e300..009e7ee 100644 --- a/nbs/00b_core.causalbert.ipynb +++ b/nbs/00b_core.causalbert.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: CausalBert API\n", + "output-file: core.causalbert.html\n", + "title: CausalBert\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp core.causalbert" + "#| default_exp core.causalbert" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# CausalBert\n", - "\n", - "> CausalBert API" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"#export\n", + "#| export\n", "\n", "# An extensible implementation of the Causal Bert model from \n", "# \"Adapting Text Embeddings for Causal Inference\" \n", @@ -360,7 +364,7 @@ " data = TensorDataset(*data)\n", " sampler = RandomSampler(data) if sampler == 'random' else SequentialSampler(data)\n", " dataloader = DataLoader(data, sampler=sampler, batch_size=self.batch_size)\n", - " return dataloader\n" + " return dataloader" ] }, { @@ -481,7 +485,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "import pandas as pd\n", "df = pd.read_csv('sample_data/music_seed50.tsv', sep='\\t', error_bad_lines=False)\n", "from causalnlp.core.causalbert import CausalBertModel\n", @@ -527,8 +531,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/01_autocoder.ipynb b/nbs/01_autocoder.ipynb index 38e91a6..e9a562b 100644 --- a/nbs/01_autocoder.ipynb +++ b/nbs/01_autocoder.ipynb @@ -1,12 +1,26 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Automatically codes text fields such as open-ended survey questions based\n", + " on lingustic properties such as topic and sentiment.\n", + "output-file: autocoder.html\n", + "title: Auto Coder\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp autocoder" + "#| default_exp autocoder" ] }, { @@ -15,28 +29,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Auto Coder\n", - "\n", - "> Automatically codes text fields such as open-ended survey questions based on lingustic properties such as topic and sentiment." 
- ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +51,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", @@ -865,7 +870,7 @@ "metadata": {}, "outputs": [], "source": [ - "#notest\n", + "#| notest\n", "df = ac.code_transformer(df.review.values, df)" ] }, @@ -2710,7 +2715,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "df.head()" ] }, @@ -3196,8 +3201,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/02_analyzers.ipynb b/nbs/02_analyzers.ipynb index daa83aa..a6938e2 100644 --- a/nbs/02_analyzers.ipynb +++ b/nbs/02_analyzers.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Text analyzers to help create text-based covariates, treatments, or outcomes\n", + " for causal analyses.\n", + "output-file: analyzers.html\n", + "title: Analyzers\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,7 +21,7 @@ "metadata": {}, "outputs": [], "source": [ - "# default_exp analyzers" + "#| default_exp analyzers" ] }, { @@ -17,23 +31,13 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "#all_notest\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "id": "0537864b", - "metadata": {}, - "source": [ - "# Analyzers\n", - "\n", - "> Text analyzers to help create text-based covariates, treatments, or outcomes for causal analyses." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -41,7 +45,7 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -52,7 +56,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "import math\n", "import warnings\n", @@ -71,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "class ZeroShotClassifier():\n", " \"\"\"\n", @@ -284,7 +288,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "#from sentence_transformers import SentenceTransformer, util\n", "\n", @@ -573,7 +577,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", @@ -867,7 +871,7 @@ "\n", " def _check_model(self):\n", " if self.model is None or self.vectorizer is None:\n", - " raise Exception('Must call train()')\n" + " raise Exception('Must call train()')" ] }, { @@ -1230,8 +1234,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/03_key_driver_analysis.ipynb b/nbs/03_key_driver_analysis.ipynb index 83abb83..f33c7a0 100644 --- a/nbs/03_key_driver_analysis.ipynb +++ b/nbs/03_key_driver_analysis.ipynb @@ -1,12 +1,27 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Key driver analysis to yield clues into **potential** causal relationships\n", + " in your data by determining variables with high predictive power, high correlation\n", + " with outcome, etc.\n", + "output-file: key_driver_analysis.html\n", + "title: Key Driver Analysis\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ - "# default_exp key_driver_analysis" + "#| default_exp key_driver_analysis" ] }, { @@ -15,28 +30,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Key Driver Analysis\n", - "\n", - "> Key driver analysis to yield clues into **potential** causal relationships in your data by determining variables with high predictive power, high correlation with outcome, etc." - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "try:\n", " import google.colab\n", " IN_COLAB = True\n", @@ -54,7 +60,7 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -64,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", @@ -579,7 +585,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "import pandas as pd\n", "df = pd.read_csv('sample_data/adult-census.csv')\n", "kda = KeyDriverAnalysis(df, outcome_col='class', ignore_cols=['fnlwgt'])\n", diff --git a/nbs/04_preprocessing.ipynb b/nbs/04_preprocessing.ipynb index 242d25a..321d2cb 100644 --- a/nbs/04_preprocessing.ipynb +++ b/nbs/04_preprocessing.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Preprocesses dataset\n", + "output-file: preprocessing.html\n", + "title: Preprocessing\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp preprocessing" + "#| default_exp preprocessing" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", 
"%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preprocessing\n", - "\n", - "> Preprocesses dataset" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", @@ -884,8 +888,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05a_meta.base.ipynb b/nbs/05a_meta.base.ipynb index ff95fa5..3fadc96 100644 --- a/nbs/05a_meta.base.ipynb +++ b/nbs/05a_meta.base.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Metalearner Base\n", + "output-file: meta.base.html\n", + "title: Base Metalearner\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.base" + "#| default_exp meta.base" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Base Metalearner\n", - "\n", - "> Metalearner Base" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -331,8 +335,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; 
notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05b_meta.tlearner.ipynb b/nbs/05b_meta.tlearner.ipynb index 8caf388..789a4a6 100644 --- a/nbs/05b_meta.tlearner.ipynb +++ b/nbs/05b_meta.tlearner.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: T-Learner\n", + "output-file: meta.tlearner.html\n", + "title: T-Learner\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.tlearner" + "#| default_exp meta.tlearner" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# T-Learner\n", - "\n", - "> T-Learner" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -457,8 +461,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05c_meta.slearner.ipynb b/nbs/05c_meta.slearner.ipynb index 6645783..2a6bf27 100644 --- a/nbs/05c_meta.slearner.ipynb +++ b/nbs/05c_meta.slearner.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: S-Learner\n", + "output-file: meta.slearner.html\n", + "title: S-Learner\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.slearner" + "#| 
default_exp meta.slearner" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# S-Learner\n", - "\n", - "> S-Learner" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -374,7 +378,7 @@ " if not return_components:\n", " return te\n", " else:\n", - " return te, yhat_cs, yhat_ts\n" + " return te, yhat_cs, yhat_ts" ] }, { @@ -407,8 +411,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05d_meta.xlearner.ipynb b/nbs/05d_meta.xlearner.ipynb index 3928dd3..41eb97d 100644 --- a/nbs/05d_meta.xlearner.ipynb +++ b/nbs/05d_meta.xlearner.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: X-Learner\n", + "output-file: meta.xlearner.html\n", + "title: X-Learner\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.xlearner" + "#| default_exp meta.xlearner" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# X-Learner\n", - "\n", - "> X-Learner" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import 
*" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -633,8 +637,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05e_meta.rlearner.ipynb b/nbs/05e_meta.rlearner.ipynb index c8ce05d..0c04ce9 100644 --- a/nbs/05e_meta.rlearner.ipynb +++ b/nbs/05e_meta.rlearner.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: R-Learner\n", + "output-file: meta.rlearner.html\n", + "title: R-Learner\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.rlearner" + "#| default_exp meta.rlearner" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# R-Learner\n", - "\n", - "> R-Learner" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -610,8 +614,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05f_meta.utils.ipynb b/nbs/05f_meta.utils.ipynb index cfdc618..66b3f06 100644 --- a/nbs/05f_meta.utils.ipynb +++ b/nbs/05f_meta.utils.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: 
Metalearner Utils\n", + "output-file: meta.utils.html\n", + "title: Metalearner Utils\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.utils" + "#| default_exp meta.utils" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Metalearner Utils\n", - "\n", - "> Metalearner Utils" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -174,7 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "EPS = 1e-15" ] }, @@ -184,7 +188,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "import logging\n", "import numpy as np\n", @@ -317,7 +321,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "import logging\n", "from sklearn.metrics import log_loss, roc_auc_score\n", "\n", @@ -348,7 +352,7 @@ " metrics for the treatment and control group separately\n", " metrics (dict, optional): a dictionary of the metric names and functions\n", " \"\"\"\n", - " regression_metrics(y=y, p=p, w=w, metrics=metrics)\n" + " regression_metrics(y=y, p=p, w=w, metrics=metrics)" ] }, { @@ -357,7 +361,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "import argparse\n", "import logging\n", @@ -720,7 +724,7 @@ " if self.verbose:\n", " logger.info('\\n-----\\nBest params are:\\n{}'.format(self.best_params))\n", "\n", - " return self.best_matched\n" + " return self.best_matched" ] }, { @@ -751,8 +755,8 @@ } 
], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05g_meta.explainer.ipynb b/nbs/05g_meta.explainer.ipynb index 555ecfb..34510fb 100644 --- a/nbs/05g_meta.explainer.ipynb +++ b/nbs/05g_meta.explainer.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Metalearner Expainer\n", + "output-file: meta.explainer.html\n", + "title: Metalearner Explainer\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.explainer" + "#| default_exp meta.explainer" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Metalearner Explainer\n", - "\n", - "> Metalearner Expainer" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -309,8 +313,8 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05h_meta.propensity.ipynb b/nbs/05h_meta.propensity.ipynb index 95b98e7..7e7d908 100644 --- a/nbs/05h_meta.propensity.ipynb +++ b/nbs/05h_meta.propensity.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Metalearner Propensity\n", + "output-file: meta.propensity.html\n", + "title: 
Metalearner Propensity\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.propensity" + "#| default_exp meta.propensity" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Metalearner Propensity\n", - "\n", - "> Metalearner Propensity" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -341,8 +345,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/05i_meta.sensitivity.ipynb b/nbs/05i_meta.sensitivity.ipynb index 44f3e83..a7fe9c4 100644 --- a/nbs/05i_meta.sensitivity.ipynb +++ b/nbs/05i_meta.sensitivity.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Metalearner Sensitivity\n", + "output-file: meta.sensitivity.html\n", + "title: Metalearner Sensitivity\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# default_exp meta.sensitivity" + "#| default_exp meta.sensitivity" ] }, { @@ -15,28 +28,19 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Metalearner Sensitivity\n", - "\n", - "> Metalearner Sensitivity" - ] - }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "from nbdev.showdoc import *" ] }, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#export\n", + "#| export\n", "\n", "# REFERENCE: https://github.com/uber/causalml\n", "\n", @@ -583,7 +587,7 @@ " feature_name, partial_rsqs_value, confounding_min, confounding_max))\n", " return [confounding_min, confounding_max]\n", " else:\n", - " logger.info('Cannot find correponding rsquare value within the range for input, please edit confounding', 'values vector or use a larger range and try again')\n" + " logger.info('Cannot find correponding rsquare value within the range for input, please edit confounding', 'values vector or use a larger range and try again')" ] }, { @@ -615,8 +619,8 @@ } ], "source": [ - "#hide\n", - "from nbdev.export import notebook2script; notebook2script()" + "#| include: false\n", + "from nbdev import nbdev_export; nbdev_export()" ] }, { diff --git a/nbs/99_examples.ipynb b/nbs/99_examples.ipynb index 84ef4b6..816dcfa 100644 --- a/nbs/99_examples.ipynb +++ b/nbs/99_examples.ipynb @@ -1,26 +1,30 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "description: Various examples of CausalNLP on semi-simulated or real datasets.\n", + "output-file: examples.html\n", + "title: Examples\n", + "\n", + "---\n", + "\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Examples\n", - "\n", - "> Various examples of CausalNLP on semi-simulated or real datasets." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -46,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "#notest\n", + "#| notest\n", "import pandas as pd\n", "df = pd.read_csv('sample_data/music_seed50.tsv', sep='\\t', error_bad_lines=False)" ] @@ -213,7 +217,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "df.head()" ] }, @@ -269,7 +273,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "from lightgbm import LGBMClassifier\n", "from sklearn.linear_model import LogisticRegression, LinearRegression\n", "cm = CausalInferenceModel(df, method='t-learner',\n", @@ -306,7 +310,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.estimate_ate()" ] }, @@ -333,7 +337,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "from collections import defaultdict\n", "import numpy as np\n", "def ATE_adjusted(C, T, Y):\n", @@ -449,7 +453,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.evaluate_robustness()" ] }, @@ -486,7 +490,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "series = df['text']\n", "cm.estimate_ate(df['text'].str.contains('toddler'))" ] @@ -517,7 +521,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.get_required_columns()" ] }, @@ -527,7 +531,7 @@ "metadata": {}, "outputs": [], "source": [ - "#notest\n", + "#| notest\n", "test_df = pd.DataFrame({\n", " 'T_ac' : [1],\n", " 'C_true' : [1],\n", @@ -552,7 +556,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.predict(test_df)" ] }, @@ -592,7 +596,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.interpret(plot=False, method='feature_importance')[1][:10]" ] }, @@ -629,7 +633,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.explain(test_df, row_num=0)" ] }, @@ -654,7 +658,7 @@ } ], "source": [ - "#hide\n", + "#| include: false\n", "import pandas as pd\n", "df = pd.read_csv('sample_data/houses.csv')\n", "fn = lambda x: 1 if x == 'Abnorml' else 0\n", @@ -859,7 +863,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "import pandas as pd\n", 
"df = pd.read_csv('sample_data/adult-census.csv')\n", "df = df.rename(columns=lambda x: x.strip())\n", @@ -889,7 +893,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "from causalnlp import CausalInferenceModel\n", "cm = CausalInferenceModel(df, method='t-learner',\n", " treatment_col='treatment', \n", @@ -921,7 +925,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.estimate_ate()" ] }, @@ -949,7 +953,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.estimate_ate(cm.df['education'] == 'Masters')" ] }, @@ -977,7 +981,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "cm.estimate_ate(cm.df['education'].isin(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '12th']))" ] }, @@ -1126,7 +1130,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "import pandas as pd\n", "df = pd.read_csv('sample_data/lalonde.csv')\n", "df.head()" @@ -1159,7 +1163,7 @@ } ], "source": [ - "#notest\n", + "#| notest\n", "from causalnlp import CausalInferenceModel\n", "cm = CausalInferenceModel(df, method='s-learner',\n", " treatment_col='treat', \n", diff --git a/nbs/index.ipynb b/nbs/index.ipynb index 14154aa..6322f9f 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -1,10 +1,15 @@ { "cells": [ { - "cell_type": "markdown", + "cell_type": "raw", "metadata": {}, "source": [ - "# Welcome to CausalNLP" + "---\n", + "output-file: index.html\n", + "title: Welcome to CausalNLP\n", + "\n", + "---\n", + "\n" ] }, { @@ -75,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "#hide\n", + "#| include: false\n", "#all_notest" ] }, diff --git a/setup.py b/setup.py index 2ec2c98..ca609dc 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ from pkg_resources import parse_version from configparser import ConfigParser -import setuptools,re,sys +import setuptools, shlex assert parse_version(setuptools.__version__)>=parse_version('36.2') # note: all settings are in settings.ini; edit there, not here config = ConfigParser(delimiters=['=']) 
-config.read('settings.ini') +config.read('settings.ini', encoding='utf-8') cfg = config['DEFAULT'] cfg_keys = 'version description keywords author author_email'.split() @@ -13,10 +13,6 @@ for o in expected: assert o in cfg, "missing expected setting: {}".format(o) setup_cfg = {o:cfg[o] for o in cfg_keys} -if len(sys.argv)>1 and sys.argv[1]=='version': - print(setup_cfg['version']) - exit() - licenses = { 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 'mit': ('MIT License', 'OSI Approved :: MIT License'), @@ -26,22 +22,14 @@ } statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] -py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split() +py_versions = '3.6 3.7 3.8 3.9 3.10 3.11 3.12'.split() -lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) +requirements = shlex.split(cfg.get('requirements', '')) +if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', '')) min_python = cfg['min_python'] - -requirements = ['pip', 'packaging'] -if cfg.get('requirements'): requirements += cfg.get('requirements','').split() -if cfg.get('pip_requirements'): requirements += cfg.get('pip_requirements','').split() +lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) dev_requirements = (cfg.get('dev_requirements') or '').split() -long_description = open('README.md').read() -# ![png](docs/images/output_13_0.png) -for ext in ['png', 'svg']: - long_description = re.sub(r'!\['+ext+'\]\((.*)\)', '!['+ext+']('+'https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1)', long_description) - long_description = re.sub(r'src=\"(.*)\.'+ext+'\"', 'src=\"https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1.'+ext+'\"', long_description) - setuptools.setup( name = cfg['lib_name'], license = lic[0], @@ 
-55,10 +43,15 @@ include_package_data = True, install_requires = requirements, extras_require={ 'dev': dev_requirements }, + dependency_links = cfg.get('dep_links','').split(), python_requires = '>=' + cfg['min_python'], - long_description = long_description, + long_description = open('README.md', encoding='utf-8').read(), long_description_content_type = 'text/markdown', zip_safe = False, - entry_points = { 'console_scripts': cfg.get('console_scripts','').split() }, + entry_points = { + 'console_scripts': cfg.get('console_scripts','').split(), + 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] + }, **setup_cfg) + diff --git a/styles.css b/styles.css new file mode 100644 index 0000000..66ccc49 --- /dev/null +++ b/styles.css @@ -0,0 +1,37 @@ +.cell { + margin-bottom: 1rem; +} + +.cell > .sourceCode { + margin-bottom: 0; +} + +.cell-output > pre { + margin-bottom: 0; +} + +.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { + margin-left: 0.8rem; + margin-top: 0; + background: none; + border-left: 2px solid lightsalmon; + border-top-left-radius: 0; + border-top-right-radius: 0; +} + +.cell-output > .sourceCode { + border: none; +} + +.cell-output > .sourceCode { + background: none; + margin-top: 0; +} + +div.description { + padding-left: 2px; + padding-top: 5px; + font-style: italic; + font-size: 135%; + opacity: 70%; +}