DRAFT: Feat/contact details #50

Closed
wants to merge 8 commits
18 changes: 18 additions & 0 deletions actors/contacts-scraper/.actor/actor.json
@@ -0,0 +1,18 @@
{
"actorSpecification": 1,
"name": "gpt-contacts-scraper",
"title": "GPT Contacts Scraper",
"description": "Crawler uses OpenAI API",
"version": "0.0",
"meta": {
"templateId": "ts-crawlee-playwright-chrome"
},
"input": "./input_schema.json",
"readme": "./README.md",
"dockerfile": "../../../shared/Dockerfile",
"changelog":"../../../shared/CHANGELOG.md",
"storages": {
"dataset": "../../../shared/dataset_schema.json"
},
"dockerContextDir": "../../.."
}
118 changes: 118 additions & 0 deletions actors/contacts-scraper/.actor/input_schema.json
@@ -0,0 +1,118 @@
{
"title": "GPT Contacts Scraper",
"type": "object",
"description": "The crawler scrapes contact details from pages via GPT",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "Start URLs",
"type": "array",
"description": "A static list of URLs to scrape. <br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#start-urls' target='_blank' rel='noopener'>Start URLs</a> in README.",
"prefill": [
{ "url": "https://news.ycombinator.com/" }
],
"editor": "requestListSources"
},
"includeUrlGlobs": {
"title": "Include URLs (globs)",
"type": "array",
"description": "Glob patterns matching URLs of pages that will be included in crawling. Combine them with the link selector to tell the scraper where to find links. Omitting the glob patterns will cause the scraper to enqueue all links matched by the link selector.",
"editor": "globs",
"default": [],
"prefill": []
},
"excludeUrlGlobs": {
"title": "Exclude URLs (globs)",
"type": "array",
"description": "Glob patterns matching URLs of pages that will be excluded from crawling. Note that this affects only links found on pages, but not Start URLs, which are always crawled.",
"editor": "globs",
"default": [],
"prefill": []
},
"linkSelector": {
"title": "Link selector",
"type": "string",
"description": "This is a CSS selector that says which links on the page (<code>&lt;a&gt;</code> elements with <code>href</code> attribute) should be followed and added to the request queue. To filter the links added to the queue, use the <b>Pseudo-URLs</b> setting.<br><br>If <b>Link selector</b> is empty, the page links are ignored.<br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#link-selector' target='_blank' rel='noopener'>Link selector</a> in README.",
"editor": "textfield",
"prefill": "a[href]"
},
"initialCookies": {
"title": "Initial cookies",
"type": "array",
"description": "Cookies that will be pre-set to all pages the scraper opens. This is useful for pages that require login. The value is expected to be a JSON array of objects with `name`, `value`, 'domain' and 'path' properties. For example: `[{\"name\": \"cookieName\", \"value\": \"cookieValue\"}, \"domain\": \".domain.com\", \"path\": \"/\"}]`.\n\nYou can use the [EditThisCookie](https://chrome.google.com/webstore/detail/editthiscookie/fngmhnnpilhplaeedifhccceomclgfbg) browser extension to copy browser cookies in this format, and paste it here.",
"default": [],
"prefill": [],
"editor": "json"
},
"openaiApiKey": {
"title": "OpenAI API key",
"type": "string",
"description": "The API key for accessing OpenAI. You can get it from <a href='https://platform.openai.com/account/api-keys' target='_blank' rel='noopener'>OpenAI platform</a>.",
"editor": "textfield",
"isSecret": true
},
"targetSelector": {
"title": "Content selector",
"type": "string",
"description": "A CSS selector of the HTML element on the page that will be used in the instruction. Instead of a whole page, you can use only part of the page. For example: \"div#content\".",
"editor": "textfield",
"prefill": ""
},
"removeElementsCssSelector": {
"title": "Remove HTML elements (CSS selector)",
"type": "string",
"description": "A CSS selector matching HTML elements that will be removed from the DOM, before sending it to GPT processing. This is useful to skip irrelevant page content and save on GPT input tokens. \n\nBy default, the Actor removes usually unwanted elements like scripts, styles and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
"editor": "textarea",
"default": "script, style, noscript, path, svg, xlink",
"prefill": "script, style, noscript, path, svg, xlink"
},
"maxCrawlingDepth": {
"title": "Max crawling depth",
"type": "integer",
"description": "This specifies how many links away from the <b>Start URLs</b> the scraper will descend. This value is a safeguard against infinite crawling depths for misconfigured scrapers.<br><br>If set to <code>0</code>, there is no limit.",
"minimum": 0,
"default": 0
},
"maxPagesPerCrawl": {
"title": "Max pages per run",
"type": "integer",
"description": "Maximum number of pages that the scraper will open. 0 means unlimited.",
"minimum": 0,
"default": 10,
"unit": "pages"
},
"skipGptGlobs": {
"title": "Skip GPT processing for Globs",
"type": "array",
"description": "This setting allows you to specify certain page URLs to skip GPT instructions for. Pages matching these glob patterns will only be crawled for links, excluding them from GPT processing. Useful for intermediary pages used for navigation or undesired content.",
"editor": "globs",
"default": [],
"prefill": []
},
"proxyConfiguration": {
"sectionCaption": "Advanced configuration",
"title": "Proxy configuration",
"type": "object",
"description": "This specifies the proxy servers that will be used by the scraper in order to hide its origin.<br><br>For details, see <a href='https://apify.com/drobnikj/extended-gpt-scraper#proxy-configuration' target='_blank' rel='noopener'>Proxy configuration</a> in README.",
"prefill": { "useApifyProxy": true },
"default": { "useApifyProxy": false },
"editor": "proxy"
},
"pageFormatInRequest": {
"title": "Page format in request",
"type": "string",
"description": "In what format to send the content extracted from the page to the GPT. Markdown will take less space allowing for larger requests, while HTML may help include some information like attributes that may otherwise be omitted.",
"enum": ["HTML", "Markdown"],
"enumTitles": ["HTML", "Markdown"],
"default": "Markdown"
},
"saveSnapshots": {
"title": "Save debug snapshots",
"type": "boolean",
"description": "For each page store its HTML, screenshot and parsed content (markdown/HTML as it was sent to ChatGPT) adding links to these into the output",
"editor": "checkbox",
"default": true
}
},
"required": ["startUrls", "openaiApiKey"]
}
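
For orientation, here is a minimal, illustrative input that satisfies the schema above. Only the two required fields are mandatory; everything else falls back to the prefills/defaults declared in input_schema.json. The URL and the API-key placeholder are examples, not values taken from this PR.

const input = {
    // Required fields:
    startUrls: [{ url: 'https://news.ycombinator.com/' }],
    openaiApiKey: '<YOUR_OPENAI_API_KEY>', // placeholder — supply your own key
    // Optional fields, shown here with the schema's prefill/default values:
    linkSelector: 'a[href]',
    maxPagesPerCrawl: 10,
    pageFormatInRequest: 'Markdown',
    saveSnapshots: true,
};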
10 changes: 10 additions & 0 deletions actors/contacts-scraper/.dockerignore
@@ -0,0 +1,10 @@
# configurations
.idea

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules
8 changes: 8 additions & 0 deletions actors/contacts-scraper/.gitignore
@@ -0,0 +1,8 @@
# This file tells Git which files shouldn't be added to source control

.idea
dist
node_modules
apify_storage
crawlee_storage
storage
51 changes: 51 additions & 0 deletions actors/contacts-scraper/Dockerfile
@@ -0,0 +1,51 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18 AS builder

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install all dependencies. Don't audit to speed up the installation.
RUN npm install --include=dev --audit=false

# Next, copy the source files using the user set
# in the base image.
COPY --chown=myuser . ./

# Build the project. Dependencies were already installed in the previous step.
RUN npm run build

# Create final image
FROM apify/actor-node-playwright-chrome:18

# Copy only built JS files from builder image
COPY --from=builder --chown=myuser /home/myuser/dist ./dist

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick rebuilds will be really fast
# for most source file changes.
COPY --chown=myuser . ./


# Run the image. If you know you won't need headful browsers,
# you can remove the XVFB start script for a micro perf gain.
CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
9 changes: 9 additions & 0 deletions actors/contacts-scraper/README.md
@@ -0,0 +1,9 @@
# Getting started with Crawlee

This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).

You can find more examples and documentation at the following links:

- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
25 changes: 25 additions & 0 deletions actors/contacts-scraper/package.json
@@ -0,0 +1,25 @@
{
"name": "contacts-scraper",
"version": "0.0.1",
"type": "module",
"description": "This is an example of a Crawlee project.",
"dependencies": {
"@packages/contact-scraper": "*",
"apify": "^3.1.15",
"crawlee": "^3.0.0"
},
"devDependencies": {
"@apify/tsconfig": "^0.1.0",
"@types/node": "^20.0.0",
"typescript": "~5.3.0"
},
"scripts": {
"start": "node --no-warnings --experimental-specifier-resolution=node --loader ../../module_loader.js src/main.ts",
"start:prod": "node dist/main.js",
"build": "tsc --build tsconfig.build.json",
"lint": "eslint ./src --ext .ts",
"lint:fix": "eslint ./src --ext .ts --fix"
},
"author": "It's not you it's me",
"license": "ISC"
}
33 changes: 33 additions & 0 deletions actors/contacts-scraper/src/main.ts
@@ -0,0 +1,33 @@
import { Actor } from 'apify';
import { INTRO_PROMPT, JSON_SCHEMA, MODEL_NAME, MODEL_SETTINGS } from '@packages/contact-scraper';

await Actor.init();

// Get input of your Actor.
const input = (await Actor.getInput()) as Record<string, any>;

GitHub Actions / lint warning on line 7 of actors/contacts-scraper/src/main.ts: Unexpected any. Specify a different type

// Create input for drobnikj/extended-gpt-scraper
const newInput = {
...input,
instructions: INTRO_PROMPT,
useStructureOutput: true,
schema: JSON_SCHEMA,
model: MODEL_NAME,
...Object.fromEntries(
Object.entries(
MODEL_SETTINGS,
).map(([key, value]) => [
key,
// input schema of extended-gpt-scraper requires string values
value.toString(),
]),
),
};

// Metamorph the Actor run into drobnikj/extended-gpt-scraper
// with the new input.
await Actor.metamorph('drobnikj/extended-gpt-scraper', newInput);

// This line will never be reached, because the metamorph
// interrupts the current Actor run.
await Actor.exit();
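
One non-obvious step above is the spread over MODEL_SETTINGS: every value is converted to a string because the input schema of drobnikj/extended-gpt-scraper expects string values. Below is a minimal sketch of that conversion, using hypothetical settings — the real MODEL_SETTINGS constant lives in @packages/contact-scraper and is not part of this diff.

// Hypothetical settings — the actual MODEL_SETTINGS values come from
// @packages/contact-scraper and are not shown in this PR.
const MODEL_SETTINGS = { temperature: 0.2, maxTokens: 2048 };

// Same transformation as in main.ts: each value becomes a string,
// e.g. { temperature: 0.2 } -> { temperature: '0.2' }.
const stringified = Object.fromEntries(
    Object.entries(MODEL_SETTINGS).map(([key, value]) => [key, value.toString()]),
);

console.log(stringified); // { temperature: '0.2', maxTokens: '2048' }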
16 changes: 16 additions & 0 deletions actors/contacts-scraper/tsconfig.build.json
@@ -0,0 +1,16 @@
{
"extends": "../../tsconfig.build.json",
"compilerOptions": {
"composite": true,
"rootDir": "./src",
"outDir": "./dist",
"tsBuildInfoFile": "./dist/.tsbuildinfo"
},
"references": [
{ "path": "../../packages/gpt-scraper-core/tsconfig.build.json" },
{ "path": "../../packages/contact-scraper/tsconfig.build.json"}
],
"include": [
"src/**/*",
]
}
6 changes: 6 additions & 0 deletions actors/contacts-scraper/tsconfig.json
@@ -0,0 +1,6 @@
{
"extends": "../../tsconfig.json",
"include": [
"src/**/*",
]
}
1 change: 1 addition & 0 deletions actors/contacts-scraper/tsconfig.tsbuildinfo

Large diffs are not rendered by default.
