diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d711ddf..9f37522 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,6 +35,19 @@ In case of Google Cloud Identity-Aware Proxy, please specify these env variables - `IAP_AUTH_CLIENT_ID` - # pick [client ID of the application](https://console.cloud.google.com/apis/credentials) you are connecting to - `IAP_AUTH_SERVICE_ACCOUNT_JSON` - # generate in [Actions](https://console.cloud.google.com/iam-admin/serviceaccounts) -> Create key -> JSON +#### Custom headers + +To send custom HTTP headers with every request, add a `headers` object to the configuration: + +```json +{ + "index_name": "my_index", + "headers": { + "Authorization": "Bearer <token>" + } +} +``` + ### Installing Chrome Headless Websites that need JavaScript for rendering are passed through ChromeDriver. diff --git a/scraper/src/config/config_loader.py b/scraper/src/config/config_loader.py index 1fff0fd..975f625 100644 --- a/scraper/src/config/config_loader.py +++ b/scraper/src/config/config_loader.py @@ -33,6 +33,7 @@ class ConfigLoader: index_name_tmp = None js_wait = 0 js_render = False + headers = None keep_tags = [] min_indexed_level = 0 remove_get_params = False diff --git a/scraper/src/index.py b/scraper/src/index.py index c26e94b..9eaa963 100644 --- a/scraper/src/index.py +++ b/scraper/src/index.py @@ -53,6 +53,9 @@ def run_config(config): "Accept-Language": "en", } # Defaults for scrapy https://docs.scrapy.org/en/latest/topics/settings.html#default-request-headers + if config.headers is not None: + headers.update(config.headers) + # Cloudflare Zero Trust (CF) if (os.getenv("CF_ACCESS_CLIENT_ID") and os.getenv("CF_ACCESS_CLIENT_SECRET")): diff --git a/scraper/src/tests/config_loader/headers_test.py b/scraper/src/tests/config_loader/headers_test.py new file mode 100644 index 0000000..c049462 --- /dev/null +++ b/scraper/src/tests/config_loader/headers_test.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from ...config.config_loader import ConfigLoader +from .abstract import config +import pytest + + +class TestInit: + 
def test_header(self):
+        """ A single configured header is exposed by the loader """
+        # Given a configuration that declares one custom header
+        expected_headers = {
+            'Authorization': 'Bearer xyz'
+        }
+        raw_config = config({
+            'headers': dict(expected_headers)
+        })
+
+        loader = ConfigLoader(raw_config)
+
+        # Then the loader reports exactly that mapping
+        assert loader.headers == expected_headers
+
+    def test_multiple_header(self):
+        """ Every configured header is exposed by the loader """
+        # Given a configuration that declares several custom headers
+        expected_headers = {
+            'Authorization': 'Bearer xyz',
+            # Integer value: expected back as 1, not as a string
+            'Custom': 1,
+        }
+        raw_config = config({
+            'headers': dict(expected_headers)
+        })
+
+        loader = ConfigLoader(raw_config)
+
+        # Then the loader reports exactly that mapping
+        assert loader.headers == expected_headers