diff --git a/.github/workflows/check-pr-title.yml b/.github/workflows/check-pr-title.yml index 20f475c6b0cc..b6c07ca0aeaa 100644 --- a/.github/workflows/check-pr-title.yml +++ b/.github/workflows/check-pr-title.yml @@ -7,7 +7,7 @@ on: jobs: check_pr_title: name: 'Check PR title' - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: amannn/action-semantic-pull-request@v5.5.3 env: diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml index 59c92b4c179f..2678ba96a8f2 100644 --- a/.github/workflows/docker-images.yml +++ b/.github/workflows/docker-images.yml @@ -26,7 +26,7 @@ env: jobs: trigger_ci: name: Trigger CI - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: github.repository == 'apify/crawlee' diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 06a2d7f3d240..38af5d78683d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,7 +14,7 @@ jobs: contents: write pages: write id-token: write - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4ac8898b88ab..c2509a735df0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -32,7 +32,7 @@ jobs: fail-fast: true matrix: # We don't test on Windows as the tests are flaky - os: [ ubuntu-latest ] + os: [ ubuntu-22.04 ] node-version: [ 16, 18, 20, 22 ] runs-on: ${{ matrix.os }} @@ -86,7 +86,7 @@ jobs: name: "Bump Crawlee: ${{ inputs.version }} version (${{ inputs.custom_version || 'n/a' }} custom version)" if: (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:')) needs: build_and_test - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -173,7 +173,7 @@ jobs: version-docs: needs: release - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: (github.event.inputs.version == 'minor' || github.event.inputs.version == 'major') steps: diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index c234d3a1ab91..a9032edf8fe5 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -21,8 +21,8 @@ jobs: fail-fast: false matrix: # tests on windows are extremely unstable - # os: [ ubuntu-latest, windows-2019 ] - os: [ ubuntu-latest ] + # os: [ ubuntu-22.04, windows-2019 ] + os: [ ubuntu-22.04 ] node-version: [ 16, 18, 20, 22 ] steps: @@ -83,7 +83,7 @@ jobs: docs: name: Docs build if: (!contains(github.event.head_commit.message, '[skip ci]') && github.ref != 'refs/heads/master') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout Source code uses: actions/checkout@v4 @@ -125,7 +125,7 @@ jobs: lint: name: Lint - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -168,7 +168,7 @@ jobs: name: Release @next if: github.event_name == 'push' && contains(github.event.ref, 'master') && (!contains(github.event.head_commit.message, '[skip ci]') && !contains(github.event.head_commit.message, 'docs:')) needs: build_and_test - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 5d1f4c973e97..0a7623eb4d2a 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -13,7 +13,7 @@ jobs: # NPM install is done in a separate job and cached to speed up the following jobs. 
build_and_test: name: Build & Test - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: fail-fast: false diff --git a/.github/workflows/update_new_issue.yml b/.github/workflows/update_new_issue.yml index f5bd18a1d8e5..aef1901a7ac4 100644 --- a/.github/workflows/update_new_issue.yml +++ b/.github/workflows/update_new_issue.yml @@ -8,7 +8,7 @@ on: jobs: label_issues: name: Label issues - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: issues: write diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index 50ef9b949713..8bf385f1c5b5 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -61,7 +61,72 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. See the `ProxyConfigurationOptions` for all the possible constructor options. -### Crawler integration +### Static proxy list + +You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: [ + 'http://proxy-1.com', + 'http://proxy-2.com', + null // null means no proxy is used + ] +}); +``` + +This is the simplest way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. + +### Custom proxy function + +The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + newUrlFunction: (sessionId, { request }) => { + if (request?.url.includes('crawlee.dev')) { + return null; // for crawlee.dev, we don't use a proxy + } + + return 'http://proxy-1.com'; // for all other URLs, we use this proxy + } +}); +``` + +The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns a string containing the proxy URL. + +The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. + +The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. + +### Tiered proxies + +You can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website. + +:::warning + +Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)). + +Using this configuration through the `newUrl` calls will not yield the expected results. 
+ +::: + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + [null], // At first, we try to connect without a proxy + ['http://okay-proxy.com'], + ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], + ['http://very-good-and-expensive-proxy.com'], + ] +}); +``` + +This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. + +Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. + +## Crawler integration `ProxyConfiguration` integrates seamlessly into `HttpCrawler`, `CheerioCrawler`, `JSDOMCrawler`, `PlaywrightCrawler` and `PuppeteerCrawler`. @@ -95,7 +160,7 @@ All our proxy needs are managed by the `proxyConfiguration.newUrl()` allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and `SessionPool` class for more information on how keeping a real session helps us avoid blocking. diff --git a/package.json b/package.json index 4ef8097f0f00..dd2c8b993310 100644 --- a/package.json +++ b/package.json @@ -92,7 +92,7 @@ "cross-env": "^7.0.3", "deep-equal": "^2.0.5", "eslint": "^8.57.1", - "eslint-config-prettier": "^9.1.0", + "eslint-config-prettier": "^10.0.0", "express": "^4.18.1", "fs-extra": "^11.0.0", "gen-esm-wrapper": "^1.1.3", @@ -106,7 +106,7 @@ "playwright": "1.49.1", "portastic": "^1.0.1", "proxy": "^1.0.2", - "puppeteer": "23.11.1", + "puppeteer": "24.1.0", "rimraf": "^6.0.0", "tsx": "^4.4.0", "turbo": "^2.1.0", diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index f091ac212807..0c87d8b36c18 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -294,7 +294,7 @@ export interface BasicCrawlerOptions + `${count}x: ${info.at(-1)!.trim()} (${info[0]})`; + + this.log.info(`Error analysis:`, { + totalErrors: this.stats.errorTracker.total, + uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(), + mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify), + }); + } - if (this.stats.errorTracker.total !== 0) { - const prettify = ([count, info]: [number, string[]]) => `${count}x: ${info.at(-1)!.trim()} (${info[0]})`; + const client = this.config.getStorageClient(); - this.log.info(`Error analysis:`, { - totalErrors: this.stats.errorTracker.total, - uniqueErrors: this.stats.errorTracker.getUniqueErrorCount(), - mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify), - }); - } - - const client = this.config.getStorageClient(); + if (client.teardown) { + let finished = false; + setTimeout(() => { + if (!finished) { + this.log.info('Waiting for the storage to write its state to file system.'); + } + }, 1000); + await client.teardown(); + finished = true; + } - if (client.teardown) { - let finished = false; - setTimeout(() => { - if (!finished) { - this.log.info('Waiting for the storage to write its 
state to file system.'); - } - }, 1000); - await client.teardown(); - finished = true; + periodicLogger.stop(); + await this.setStatusMessage( + `Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${ + this.stats.state.requestsFinished + } succeeded, ${this.stats.state.requestsFailed} failed.`, + { isStatusMessageTerminal: true, level: 'INFO' }, + ); + this.running = false; + this.hasFinishedBefore = true; } - periodicLogger.stop(); - await this.setStatusMessage( - `Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${ - this.stats.state.requestsFinished - } succeeded, ${this.stats.state.requestsFailed} failed.`, - { isStatusMessageTerminal: true, level: 'INFO' }, - ); - this.running = false; - this.hasFinishedBefore = true; - return stats; } + /** + * Gracefully stops the current run of the crawler. + * + * All the tasks active at the time of calling this method will be allowed to finish. + */ + stop(message = 'The crawler has been gracefully stopped.'): void { + // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared. + this.autoscaledPool + ?.pause() + // Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks. + .then(async () => this.autoscaledPool?.abort()) + .then(() => this.log.info(message)) + .catch((err) => { + this.log.error('An error occurred when stopping the crawler:', err); + }); + } + async getRequestQueue() { if (!this.requestQueue && this.requestList) { this.log.warningOnce( diff --git a/packages/browser-pool/src/puppeteer/puppeteer-controller.ts b/packages/browser-pool/src/puppeteer/puppeteer-controller.ts index dd7621800599..031e4ed5867b 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-controller.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-controller.ts @@ -16,7 +16,7 @@ const PROCESS_KILL_TIMEOUT_MILLIS = 5000; export class PuppeteerController extends BrowserController< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 0bfc4f3f13ef..4b309f7eda45 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -17,14 +17,14 @@ const PROXY_SERVER_ARG = '--proxy-server='; export class PuppeteerPlugin extends BrowserPlugin< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { protected async _launch( launchContext: LaunchContext< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions >, @@ -190,7 +190,7 @@ export class PuppeteerPlugin extends BrowserPlugin< protected _createController(): BrowserController< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { @@ -200,7 +200,7 @@ export class PuppeteerPlugin extends BrowserPlugin< protected async _addProxyToLaunchOptions( _launchContext: LaunchContext< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions >, @@ -235,7 
+235,7 @@ export class PuppeteerPlugin extends BrowserPlugin< protected _isChromiumBasedBrowser( _launchContext: LaunchContext< typeof Puppeteer, - PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions >, diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index fc6b13ee8769..3d46c30dd432 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -7,7 +7,7 @@ import type { Browser } from 'puppeteer'; /** * Apify extends the launch options of Puppeteer. * You can use any of the Puppeteer compatible - * [`LaunchOptions`](https://pptr.dev/#?product=Puppeteer&show=api-puppeteerlaunchoptions) + * [`LaunchOptions`](https://pptr.dev/api/puppeteer.launchoptions) * options by providing the `launchOptions` property. * * **Example:** @@ -27,7 +27,7 @@ import type { Browser } from 'puppeteer'; */ export interface PuppeteerLaunchContext extends BrowserLaunchContext { /** - * `puppeteer.launch` [options](https://pptr.dev/#?product=Puppeteer&version=v13.5.1&show=api-puppeteerlaunchoptions) + * `puppeteer.launch` [options](https://pptr.dev/api/puppeteer.launchoptions) */ launchOptions?: PuppeteerPlugin['launchOptions']; @@ -111,7 +111,7 @@ export class PuppeteerLauncher extends BrowserLauncher /** * Launches headless Chrome using Puppeteer pre-configured to work within the Apify platform. * The function has the same argument and the return value as `puppeteer.launch()`. - * See [Puppeteer documentation](https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions) for more details. + * See [Puppeteer documentation](https://pptr.dev/api/puppeteer.launchoptions) for more details. 
* * The `launchPuppeteer()` function alters the following Puppeteer options: * diff --git a/scripts/actions/docker-images/state.json b/scripts/actions/docker-images/state.json index 6e6609519ea3..6fc71b80cd62 100644 --- a/scripts/actions/docker-images/state.json +++ b/scripts/actions/docker-images/state.json @@ -7,11 +7,11 @@ "1.49.1" ], "puppeteerVersions": [ - "23.10.2", - "23.10.3", "23.10.4", "23.11.0", - "23.11.1" + "23.11.1", + "24.0.0", + "24.1.0" ], "crawleeVersion": "3.12.1" } \ No newline at end of file diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json b/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json new file mode 100644 index 000000000000..67b63ddeba6e --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json @@ -0,0 +1,7 @@ +{ + "actorSpecification": 1, + "name": "test-cheerio-stop-resume-ts", + "version": "0.0", + "buildTag": "latest", + "env": null +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json b/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json new file mode 100644 index 000000000000..20fde449cb45 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json @@ -0,0 +1,8 @@ +{ + "root": true, + "extends": "../../.eslintrc.json", + "parserOptions": { + "project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json", + "ecmaVersion": 2022 + } +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/.gitignore b/test/e2e/cheerio-stop-resume-ts/actor/.gitignore new file mode 100644 index 000000000000..f2fc11c72bcc --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/.gitignore @@ -0,0 +1,11 @@ +.idea +.DS_Store +node_modules +package-lock.json +apify_storage +crawlee_storage +storage +main.d.ts +main.d.ts.map +main.js +main.js.map diff --git a/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile b/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile new file mode 100644 index 000000000000..59ba4ae8b5e8 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/Dockerfile @@ -0,0 +1,28 @@ +# using multistage build, as we need dev deps to build the TS source code +FROM apify/actor-node:20-beta AS builder + +# copy all files, install all dependencies (including dev deps) and build the project +COPY . 
./ +RUN npm install --include=dev \ + && npm run build + +# create final image +FROM apify/actor-node:20-beta +# copy only necessary files +COPY --from=builder /usr/src/app/packages ./packages +COPY --from=builder /usr/src/app/package.json ./ +COPY --from=builder /usr/src/app/main.js ./ + +# install only prod deps +RUN npm --quiet set progress=false \ + && npm install --only=prod --no-optional --no-audit \ + && npm update --no-audit \ + && echo "Installed NPM packages:" \ + && (npm list --only=prod --no-optional --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version + +# run compiled code +CMD npm run start:prod diff --git a/test/e2e/cheerio-stop-resume-ts/actor/main.ts b/test/e2e/cheerio-stop-resume-ts/actor/main.ts new file mode 100644 index 000000000000..8f14b3068168 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/main.ts @@ -0,0 +1,31 @@ +import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; +import { Actor } from 'apify'; + +if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') { + // @ts-ignore + await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() }); +} else { + await Actor.init(); +} + +let requestCount = 0; + +const crawler = new CheerioCrawler(); +crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => { + const { url } = request; + await enqueueLinks({ + globs: ['https://crawlee.dev/docs/**'], + }); + + const pageTitle = $('title').first().text(); + log.info(`URL: ${url} TITLE: ${pageTitle}`); + await Dataset.pushData({ url, pageTitle }); + + if (requestCount++ > 10) crawler.stop(); +}); + +await crawler.run(['https://crawlee.dev/docs/quick-start']); + +requestCount = 0; +await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false }); +await Actor.exit({ exit: Actor.isAtHome() }); diff --git a/test/e2e/cheerio-stop-resume-ts/actor/package.json b/test/e2e/cheerio-stop-resume-ts/actor/package.json new file mode 100644 index 000000000000..cf307b836523 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/package.json @@ -0,0 +1,35 @@ +{ + "name": "test-cheerio-stop-resume-ts", + "version": "0.0.1", + "description": "Crawler Stop-Resume Test - TypeScript", + "dependencies": { + "apify": "next", + "@apify/storage-local": "^2.1.3", + "@crawlee/basic": "file:./packages/basic-crawler", + "@crawlee/browser-pool": "file:./packages/browser-pool", + "@crawlee/http": "file:./packages/http-crawler", + "@crawlee/cheerio": "file:./packages/cheerio-crawler", + "@crawlee/core": "file:./packages/core", + "@crawlee/memory-storage": "file:./packages/memory-storage", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils" + }, + "overrides": { + "apify": { + "@crawlee/core": "file:./packages/core", + "@crawlee/types": "file:./packages/types", + "@crawlee/utils": "file:./packages/utils" + } + }, + "devDependencies": { + "@apify/tsconfig": "^0.1.0", + "typescript": "^5.0.0" + }, + "scripts": { + "start": "tsc && node main.js", + "start:prod": "node main.js", + "build": "tsc" + }, + "type": "module", + "license": "ISC" +} diff --git a/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json b/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json new file mode 100644 index 000000000000..7a212668d291 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "@apify/tsconfig", + "compilerOptions": { + "module": "ES2022", + "target": "ES2022", + "lib": ["DOM"] + }, + 
"include": ["./**/*.ts"] +} diff --git a/test/e2e/cheerio-stop-resume-ts/test.mjs b/test/e2e/cheerio-stop-resume-ts/test.mjs new file mode 100644 index 000000000000..b118f15ad612 --- /dev/null +++ b/test/e2e/cheerio-stop-resume-ts/test.mjs @@ -0,0 +1,12 @@ +import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs'; + +const testActorDirname = getActorTestDir(import.meta.url); +await initialize(testActorDirname); + +const { stats, datasetItems } = await runActor(testActorDirname); + +/// Some extra requests are expected (at most 10 extra for each run). +await expect(stats.requestsFinished < 40, 'crawler.stop() works'); + +const visitedUrls = new Set(datasetItems.map((x) => x.url)); +await expect(visitedUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRQ: false }) works'); diff --git a/test/e2e/linkedom-default-ts/actor/main.ts b/test/e2e/linkedom-default-ts/actor/main.ts index 95008ddb630c..ca97877d28ca 100644 --- a/test/e2e/linkedom-default-ts/actor/main.ts +++ b/test/e2e/linkedom-default-ts/actor/main.ts @@ -18,7 +18,7 @@ crawler.router.addDefaultHandler(async ({ document, enqueueLinks, request, log } globs: ['https://crawlee.dev/docs/**'], }); - const pageTitle = document.title; + const pageTitle = document.querySelector('title')?.textContent ?? ''; assert.notEqual(pageTitle, ''); log.info(`URL: ${url} TITLE: ${pageTitle}`); diff --git a/website/blog/2024/02-22-launching-crawlee-blog/index.md b/website/blog/2024/02-22-launching-crawlee-blog/index.md index 0096f8af0c6a..0806c5ab00cc 100644 --- a/website/blog/2024/02-22-launching-crawlee-blog/index.md +++ b/website/blog/2024/02-22-launching-crawlee-blog/index.md @@ -14,7 +14,7 @@ We launched Crawlee, the successor to our Apify SDK, in [August 2022](https://bl Since then, our dev community has grown exponentially. I’m proud to tell you that we have **over 11,500 Stars on GitHub**, over **6,000 community members on our Discord**, and over **125,000 downloads monthly on npm**. We’re now the most popular web scraping and automation library for Node.js developers 👏 - + ## Changes in Crawlee since the launch diff --git a/website/blog/2024/03-27-how-to-scrape-amazon-using-typescript-cheerio-and-crawlee/index.md b/website/blog/2024/03-27-how-to-scrape-amazon-using-typescript-cheerio-and-crawlee/index.md index dc47f9d2cbc8..6e67f2e04334 100644 --- a/website/blog/2024/03-27-how-to-scrape-amazon-using-typescript-cheerio-and-crawlee/index.md +++ b/website/blog/2024/03-27-how-to-scrape-amazon-using-typescript-cheerio-and-crawlee/index.md @@ -14,7 +14,7 @@ In this guide, we'll be extracting information from Amazon product pages using t ![How to scrape Amazon using Typescript, Cheerio, and Crawlee](./img/how-to-scrape-amazon.webp) - + ## Prerequisites diff --git a/website/blog/2024/04-23-scrapy-vs-crawlee/index.md b/website/blog/2024/04-23-scrapy-vs-crawlee/index.md index a89b43c14d71..107c66853629 100644 --- a/website/blog/2024/04-23-scrapy-vs-crawlee/index.md +++ b/website/blog/2024/04-23-scrapy-vs-crawlee/index.md @@ -19,7 +19,7 @@ Welcome to another post on the Crawlee blog; this time, we are going to compare Crawlee is also an open-source library that originated as [Apify SDK](https://docs.apify.com/sdk/js/). Crawlee has the advantage of being the latest library in the market, so it already has many features that Scrapy lacks, like autoscaling, headless browsing, working with JavaScript rendered websites without any plugins, and many more, which we are going to explain later on. 
- + ## Feature comparison diff --git a/website/blog/2024/06-24-proxy-management-in-crawlee/index.md b/website/blog/2024/06-24-proxy-management-in-crawlee/index.md index 65a96f6f50b8..05fe2061106d 100644 --- a/website/blog/2024/06-24-proxy-management-in-crawlee/index.md +++ b/website/blog/2024/06-24-proxy-management-in-crawlee/index.md @@ -15,7 +15,7 @@ Proxies vary in quality, speed, reliability, and cost. There are a [few types of It is hard for developers to decide which proxy to use while scraping data. We might get blocked if we use [datacenter proxies](https://blog.apify.com/datacenter-proxies-when-to-use-them-and-how-to-make-the-most-of-them/) for low-cost scraping, but residential proxies are sometimes too expensive for bigger projects. Developers need a system that can manage both costs and avoid getting blocked. To manage this, we recently introduced tiered proxies in Crawlee. Let’s take a look at it. - + :::note diff --git a/website/blog/2024/07-05-launching-crawlee-python/index.md b/website/blog/2024/07-05-launching-crawlee-python/index.md index c3308e45137f..dd5439759b1f 100644 --- a/website/blog/2024/07-05-launching-crawlee-python/index.md +++ b/website/blog/2024/07-05-launching-crawlee-python/index.md @@ -26,7 +26,7 @@ The new library is still in **beta**, and we are looking for **early adopters**. Crawlee for Python has some amazing initial features, such as a unified interface for HTTP and headless browser crawling, automatic retries, and much more. - + ## Why use Crawlee instead of a random HTTP library with an HTML parser? diff --git a/website/blog/2024/08-27-how-to-scrape-infinite-scrolling-pages/index.md b/website/blog/2024/08-27-how-to-scrape-infinite-scrolling-pages/index.md index 3e194d9a97e6..e12747a12b9d 100644 --- a/website/blog/2024/08-27-how-to-scrape-infinite-scrolling-pages/index.md +++ b/website/blog/2024/08-27-how-to-scrape-infinite-scrolling-pages/index.md @@ -18,7 +18,7 @@ As a big sneakerhead, I'll take the Nike shoes infinite-scrolling [website](http Crawlee for Python has some amazing initial features, such as a unified interface for HTTP and headless browser crawling, automatic retries, and much more. - + ## Prerequisites and bootstrapping the project diff --git a/website/blog/2024/09-30-jsdom-based-scraping/index.md b/website/blog/2024/09-30-jsdom-based-scraping/index.md index db8cbe4541c1..7c6671bbc699 100644 --- a/website/blog/2024/09-30-jsdom-based-scraping/index.md +++ b/website/blog/2024/09-30-jsdom-based-scraping/index.md @@ -14,7 +14,7 @@ This article will discuss a new approach that we use in one of our Actors to obt ![JSDOM based approach from scraping](./img/jsdom.webp) - + ## Analyzing the website diff --git a/website/blog/2024/10-14-linkedin-job-scraper-python/index.md b/website/blog/2024/10-14-linkedin-job-scraper-python/index.md index 35f48e628112..83a90fea24b3 100644 --- a/website/blog/2024/10-14-linkedin-job-scraper-python/index.md +++ b/website/blog/2024/10-14-linkedin-job-scraper-python/index.md @@ -26,7 +26,7 @@ By the end of this tutorial, you’ll have a fully functional web application th Let's begin. 
- + ## Prerequisites diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/cloudflare_link.webp b/website/blog/2025/01-03-scrape-crunchbase/img/cloudflare_link.webp new file mode 100644 index 000000000000..f24d46b4009c Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/cloudflare_link.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/data_json.webp b/website/blog/2025/01-03-scrape-crunchbase/img/data_json.webp new file mode 100644 index 000000000000..c2267e2ca9ff Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/data_json.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/scrape_crunchbase.webp b/website/blog/2025/01-03-scrape-crunchbase/img/scrape_crunchbase.webp new file mode 100644 index 000000000000..685afc988975 Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/scrape_crunchbase.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/search_protect.webp b/website/blog/2025/01-03-scrape-crunchbase/img/search_protect.webp new file mode 100644 index 000000000000..37cef562dc4c Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/search_protect.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_one.webp b/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_one.webp new file mode 100644 index 000000000000..32ddd79fde45 Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_one.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_two.webp b/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_two.webp new file mode 100644 index 000000000000..ea547d390858 Binary files /dev/null and b/website/blog/2025/01-03-scrape-crunchbase/img/sitemap_lvl_two.webp differ diff --git a/website/blog/2025/01-03-scrape-crunchbase/index.md b/website/blog/2025/01-03-scrape-crunchbase/index.md new file mode 100644 index 000000000000..a84082fecf10 --- /dev/null +++ b/website/blog/2025/01-03-scrape-crunchbase/index.md @@ -0,0 +1,546 @@ +--- +slug: scrape-crunchbase-python +title: 'How to scrape Crunchbase using Python in 2024 (Easy Guide)' +tags: [community] +description: 'Learn how to scrape Crunchbase using Crawlee for Python' +image: "./img/scrape_crunchbase.webp" +authors: [MaxB] +--- + +Python developers know the drill: you need reliable company data, and Crunchbase has it. This guide shows you how to build an effective [Crunchbase](https://www.crunchbase.com/) scraper in Python that gets you the data you need. + +Crunchbase tracks details that matter: locations, business focus, founders, and investment histories. Manual extraction from such a large dataset isn't practical -automation is essential for transforming this information into an analyzable format. + +By the end of this blog, we'll explore three different ways to extract data from Crunchbase using [`Crawlee for Python`](https://github.com/apify/crawlee-python). We'll fully implement two of them and discuss the specifics and challenges of the third. This will help us better understand how important it is to properly [choose the right data source](https://www.crawlee.dev/blog/web-scraping-tips#1-choosing-a-data-source-for-the-project). + +:::note + +This guide comes from a developer in our growing community. Have you built interesting projects with Crawlee? 
Join us on [Discord](https://discord.com/invite/jyEM2PRvMU) to share your experiences and blog ideas - we value these contributions from developers like you. + +::: + +![How to Scrape Crunchbase Using Python](./img/scrape_crunchbase.webp) + +Key steps we'll cover: + +1. Project setup +2. Choosing the data source +3. Implementing sitemap-based crawler +4. Analysis of search-based approach and its limitations +5. Implementing the official API crawler +6. Conclusion and repository access + + + +## Prerequisites + +- Python 3.9 or higher +- Familiarity with web scraping concepts +- Crawlee for Python `v0.5.0` +- poetry `v2.0` or higher + +### Project setup + +Before we start scraping, we need to set up our project. In this guide, we won't be using crawler templates (`Playwright` and `Beautifulsoup`), so we'll set up the project manually. + +1. Install [`Poetry`](https://python-poetry.org/) + + ```bash + pipx install poetry + ``` + +2. Create and navigate to the project folder. + + ```bash + mkdir crunchbase-crawlee && cd crunchbase-crawlee + ``` + +3. Initialize the project using Poetry, leaving all fields empty. + + ```bash + poetry init + ``` + + When prompted: + - For "Compatible Python versions", enter: `>={your Python version},<4.0` + (For example, if you're using Python 3.10, enter: `>=3.10,<4.0`) + - Leave all other fields empty by pressing Enter + - Confirm the generation by typing "yes" + +4. Add and install Crawlee with necessary dependencies to your project using `Poetry.` + + ```bash + poetry add crawlee[parsel,curl-impersonate] + ``` + +5. Complete the project setup by creating the standard file structure for `Crawlee for Python` projects. + + ```bash + mkdir crunchbase-crawlee && touch crunchbase-crawlee/{__init__.py,__main__.py,main.py,routes.py} + ``` + +After setting up the basic project structure, we can explore different methods of obtaining data from Crunchbase. + +### Choosing the data source + +While we can extract target data directly from the [company page](https://www.crunchbase.com/organization/apify), we need to choose the best way to navigate the site. + +A careful examination of Crunchbase's structure shows that we have three main options for obtaining data: + +1. [`Sitemap`](https://www.crunchbase.com/www-sitemaps/sitemap-index.xml) - for complete site traversal. +2. [`Search`](https://www.crunchbase.com/discover/organization.companies) - for targeted data collection. +3. [Official API](https://data.crunchbase.com/v4-legacy/docs/crunchbase-basic-getting-started) - recommended method. + +Let's examine each of these approaches in detail. + +## Scraping Crunchbase using sitemap and Crawlee for Python + +`Sitemap` is a standard way of site navigation used by crawlers like [`Google`](https://google.com/), [`Ahrefs`](https://ahrefs.com/), and other search engines. All crawlers must follow the rules described in [`robots.txt`](https://www.crunchbase.com/robots.txt). + +Let's look at the structure of Crunchbase's Sitemap: + +![Sitemap first lvl](./img/sitemap_lvl_one.webp) + +As you can see, links to organization pages are located inside second-level `Sitemap` files, which are compressed using `gzip`. + +The structure of one of these files looks like this: + +![Sitemap second lvl](./img/sitemap_lvl_two.webp) + +The `lastmod` field is particularly important here. It allows tracking which companies have updated their information since the previous data collection. This is especially useful for regular data updates. + +### 1. 
Configuring the crawler for scraping + +To work with the site, we'll use [`CurlImpersonateHttpClient`](https://www.crawlee.dev/python/api/class/CurlImpersonateHttpClient), which impersonates a `Safari` browser. While this choice might seem unexpected for working with a sitemap, it's necessitated by Crunchbase's protection features. + +The reason is that Crunchbase uses [Cloudflare](https://www.cloudflare.com/) to protect against automated access. This is clearly visible when analyzing traffic on a company page: + +![Cloudflare Link](./img/cloudflare_link.webp) + +An interesting feature is that `challenges.cloudflare` is executed after loading the document with data. This means we receive the data first, and only then JavaScript checks if we're a bot. If our HTTP client's fingerprint is sufficiently similar to a real browser, we'll successfully receive the data. + +Cloudflare [also analyzes traffic at the sitemap level](https://developers.cloudflare.com/waf/custom-rules/use-cases/allow-traffic-from-verified-bots/). If our crawler doesn't look legitimate, access will be blocked. That's why we impersonate a real browser. + +To prevent blocks due to overly aggressive crawling, we'll configure [`ConcurrencySettings`](https://www.crawlee.dev/python/api/class/ConcurrencySettings). + +When scaling this approach, you'll likely need proxies. Detailed information about proxy setup can be found in the [documentation](https://www.crawlee.dev/python/docs/guides/proxy-management). + +We'll save our scraping results in `JSON` format. Here's how the basic crawler configuration looks: + +```python +# main.py + +from crawlee import ConcurrencySettings, HttpHeaders +from crawlee.crawlers import ParselCrawler +from crawlee.http_clients import CurlImpersonateHttpClient + +from .routes import router + + +async def main() -> None: + """The crawler entry point.""" + concurrency_settings = ConcurrencySettings(max_concurrency=1, max_tasks_per_minute=50) + + http_client = CurlImpersonateHttpClient( + impersonate='safari17_0', + headers=HttpHeaders( + { + 'accept-language': 'en', + 'accept-encoding': 'gzip, deflate, br, zstd', + } + ), + ) + crawler = ParselCrawler( + request_handler=router, + max_request_retries=1, + concurrency_settings=concurrency_settings, + http_client=http_client, + max_requests_per_crawl=30, + ) + + await crawler.run(['https://www.crunchbase.com/www-sitemaps/sitemap-index.xml']) + + await crawler.export_data_json('crunchbase_data.json') +``` + +### 2. Implementing sitemap navigation + +Sitemap navigation happens in two stages. In the first stage, we need to get a list of all files containing organization information: + +```python +# routes.py + +from crawlee.crawlers import ParselCrawlingContext +from crawlee.router import Router +from crawlee import Request + +router = Router[ParselCrawlingContext]() + + +@router.default_handler +async def default_handler(context: ParselCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'default_handler processing {context.request} ...') + + requests = [ + Request.from_url(url, label='sitemap') + for url in context.selector.xpath('//loc[contains(., "sitemap-organizations")]/text()').getall() + ] + + # Since this is a tutorial, I don't want to upload more than one sitemap link + await context.add_requests(requests, limit=1) +``` + +In the second stage, we process second-level sitemap files stored in `gzip` format. 
This requires a special approach as the data needs to be decompressed first: + +```python +# routes.py + +from gzip import decompress +from parsel import Selector + + +@router.handler('sitemap') +async def sitemap_handler(context: ParselCrawlingContext) -> None: + """Sitemap gzip request handler.""" + context.log.info(f'sitemap_handler processing {context.request.url} ...') + + data = context.http_response.read() + data = decompress(data) + + selector = Selector(data.decode()) + + requests = [Request.from_url(url, label='company') for url in selector.xpath('//loc/text()').getall()] + + await context.add_requests(requests) +``` + +### 3. Extracting and saving data + +Each company page contains a large amount of information. For demonstration purposes, we'll focus on the main fields: `Company Name`, `Short Description`, `Website`, and `Location`. + +One of Crunchbase's advantages is that all data is stored in `JSON` format within the page: + +![Company Data](./img/data_json.webp) + +This significantly simplifies data extraction - we only need to use one `Xpath` selector to get the `JSON`, and then apply [`jmespath`](https://jmespath.org/) to extract the needed fields: + +```python +# routes.py + +@router.handler('company') +async def company_handler(context: ParselCrawlingContext) -> None: + """Company request handler.""" + context.log.info(f'company_handler processing {context.request.url} ...') + + json_selector = context.selector.xpath('//*[@id="ng-state"]/text()') + + await context.push_data( + { + 'Company Name': json_selector.jmespath('HttpState.*.data[].properties.identifier.value').get(), + 'Short Description': json_selector.jmespath('HttpState.*.data[].properties.short_description').get(), + 'Website': json_selector.jmespath('HttpState.*.data[].cards.company_about_fields2.website.value').get(), + 'Location': '; '.join( + json_selector.jmespath( + 'HttpState.*.data[].cards.company_about_fields2.location_identifiers[].value' + ).getall() + ), + } + ) +``` + +The collected data is saved in `Crawlee for Python`'s internal storage using the `context.push_data` method. When the crawler finishes, we export all collected data to a JSON file: + +```python +# main.py + +await crawler.export_data_json('crunchbase_data.json') +``` + +### 4. Running the project + +With all components in place, we need to create an entry point for our crawler: + +```python +# __main__.py +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) +``` + +Execute the crawler using Poetry: + +```bash +poetry run python -m crunchbase-crawlee +``` + +### 5. Finally, characteristics of using the sitemap crawler + +The sitemap approach has its distinct advantages and limitations. It's ideal in the following cases: + +- When you need to collect data about all companies on the platform +- When there are no specific company selection criteria +- If you have sufficient time and computational resources + +However, there are significant limitations to consider: + +- Almost no ability to filter data during collection +- Requires constant monitoring of Cloudflare blocks +- Scaling the solution requires proxy servers, which increases project costs + +## Using search for scraping Crunchbase + +The limitations of the sitemap approach might point to search as the next solution. However, Crunchbase applies tighter security measures to its search functionality compared to its public pages. + +The key difference lies in how Cloudflare protection works. 
While we receive data before the `challenges.cloudflare` check when accessing a company page, the search API requires valid `cookies` that have passed this check. + +Let's verify this in practice. Open the following link in Incognito mode: + +```plaintext + +``` + +When analyzing the traffic, we'll see the following pattern: + +![Search Protect](./img/search_protect.webp) + +The sequence of events here is: + +1. First, the page is blocked with code `403` +2. Then the `challenges.cloudflare` check is performed +3. Only after successfully passing the check do we receive data with code `200` + +Automating this process would require a `headless` browser capable of bypassing [`Cloudflare Turnstile`](https://www.cloudflare.com/application-services/products/turnstile/). The current version of `Crawlee for Python` (v0.5.0) doesn't provide this functionality, although it's planned for future development. + +You can extend the capabilities of Crawlee for Python by integrating [`Camoufox`](https://camoufox.com/) following this [example.](https://www.crawlee.dev/python/docs/examples/playwright-crawler-with-camoufox) + +## Working with the official Crunchbase API + +Crunchbase provides a [free API](https://data.crunchbase.com/v4-legacy/docs/crunchbase-basic-using-api) with basic functionality. Paid subscription users get expanded data access. Complete documentation for available endpoints can be found in the [official API specification](https://app.swaggerhub.com/apis-docs/Crunchbase/crunchbase-enterprise_api). + +### 1. Setting up API access + +To start working with the API, follow these steps: + +1. [Create a Crunchbase account](https://www.crunchbase.com/register) +2. Go to the Integrations section +3. Create a Crunchbase Basic API key + +Although the documentation states that key activation may take up to an hour, it usually starts working immediately after creation. + +### 2. Configuring the crawler for API work + +An important API feature is the limit - no more than 200 requests per minute, but in the free version, this number is significantly lower. Taking this into account, let's configure [`ConcurrencySettings`](https://www.crawlee.dev/python/api/class/ConcurrencySettings). Since we're working with the official API, we don't need to mask our HTTP client. We'll use the standard ['HttpxHttpClient'](https://www.crawlee.dev/python/api/class/HttpxHttpClient) with preset headers. + +First, let's save the API key in an environment variable: + +```bash +export CRUNCHBASE_TOKEN={YOUR KEY} +``` + +Here's how the crawler configuration for working with the API looks: + +```python +# main.py + +import os + +from crawlee.crawlers import HttpCrawler +from crawlee.http_clients import HttpxHttpClient +from crawlee import ConcurrencySettings, HttpHeaders + +from .routes import router + +CRUNCHBASE_TOKEN = os.getenv('CRUNCHBASE_TOKEN', '') + + +async def main() -> None: + """The crawler entry point.""" + + concurrency_settings = ConcurrencySettings(max_tasks_per_minute=60) + + http_client = HttpxHttpClient( + headers=HttpHeaders({'accept-encoding': 'gzip, deflate, br, zstd', 'X-cb-user-key': CRUNCHBASE_TOKEN}) + ) + crawler = HttpCrawler( + request_handler=router, + concurrency_settings=concurrency_settings, + http_client=http_client, + max_requests_per_crawl=30, + ) + + await crawler.run( + ['https://api.crunchbase.com/api/v4/autocompletes?query=apify&collection_ids=organizations&limit=25'] + ) + + await crawler.export_data_json('crunchbase_data.json') +``` + +### 3. 
Processing search results
+
+For working with the API, we'll need two main endpoints:
+
+1. [get_autocompletes](https://app.swaggerhub.com/apis-docs/Crunchbase/crunchbase-enterprise_api/1.0.3#/Autocomplete/get_autocompletes) - for searching
+2. [get_entities_organizations__entity_id](https://app.swaggerhub.com/apis-docs/Crunchbase/crunchbase-enterprise_api/1.0.3#/Entity/get_entities_organizations__entity_id_) - for getting data
+
+First, let's implement search results processing:
+
+```python
+import json
+
+from crawlee.crawlers import HttpCrawlingContext
+from crawlee.router import Router
+from crawlee import Request
+
+router = Router[HttpCrawlingContext]()
+
+
+@router.default_handler
+async def default_handler(context: HttpCrawlingContext) -> None:
+    """Default request handler."""
+    context.log.info(f'default_handler processing {context.request.url} ...')
+
+    data = json.loads(context.http_response.read())
+
+    requests = []
+
+    for entity in data['entities']:
+        permalink = entity['identifier']['permalink']
+        requests.append(
+            Request.from_url(
+                url=f'https://api.crunchbase.com/api/v4/entities/organizations/{permalink}?field_ids=short_description%2Clocation_identifiers%2Cwebsite_url',
+                label='company',
+            )
+        )
+
+    await context.add_requests(requests)
+```
+
+### 4. Extracting company data
+
+After getting the list of companies, we extract detailed information about each one:
+
+```python
+@router.handler('company')
+async def company_handler(context: HttpCrawlingContext) -> None:
+    """Company request handler."""
+    context.log.info(f'company_handler processing {context.request.url} ...')
+
+    data = json.loads(context.http_response.read())
+
+    await context.push_data(
+        {
+            'Company Name': data['properties']['identifier']['value'],
+            'Short Description': data['properties']['short_description'],
+            'Website': data['properties'].get('website_url'),
+            'Location': '; '.join([item['value'] for item in data['properties'].get('location_identifiers', [])]),
+        }
+    )
+```
+
+### 5. Advanced location-based search
+
+If you need more flexible search capabilities, the API provides a special [`search`](https://app.swaggerhub.com/apis-docs/Crunchbase/crunchbase-enterprise_api/1.0.3#/Search/post_searches_organizations) endpoint. 
Here's an example of searching for all companies in Prague: + +```python +payload = { + 'field_ids': ['identifier', 'location_identifiers', 'short_description', 'website_url'], + 'limit': 200, + 'order': [{'field_id': 'rank_org', 'sort': 'asc'}], + 'query': [ + { + 'field_id': 'location_identifiers', + 'operator_id': 'includes', + 'type': 'predicate', + 'values': ['e0b951dc-f710-8754-ddde-5ef04dddd9f8'], + }, + {'field_id': 'facet_ids', 'operator_id': 'includes', 'type': 'predicate', 'values': ['company']}, + ], +} + +serialiazed_payload = json.dumps(payload) +await crawler.run( + [ + Request.from_url( + url='https://api.crunchbase.com/api/v4/searches/organizations', + method='POST', + payload=serialiazed_payload, + use_extended_unique_key=True, + headers=HttpHeaders({'Content-Type': 'application/json'}), + label='search', + ) + ] +) +``` + +For processing search results and pagination, we use the following handler: + +```python +@router.handler('search') +async def search_handler(context: HttpCrawlingContext) -> None: + """Search results handler with pagination support.""" + context.log.info(f'search_handler processing {context.request.url} ...') + + data = json.loads(context.http_response.read()) + + last_entity = None + results = [] + + for entity in data['entities']: + last_entity = entity['uuid'] + results.append( + { + 'Company Name': entity['properties']['identifier']['value'], + 'Short Description': entity['properties']['short_description'], + 'Website': entity['properties'].get('website_url'), + 'Location': '; '.join([item['value'] for item in entity['properties'].get('location_identifiers', [])]), + } + ) + + if results: + await context.push_data(results) + + if last_entity: + payload = json.loads(context.request.payload) + payload['after_id'] = last_entity + payload = json.dumps(payload) + + await context.add_requests( + [ + Request.from_url( + url='https://api.crunchbase.com/api/v4/searches/organizations', + method='POST', + payload=payload, + use_extended_unique_key=True, + headers=HttpHeaders({'Content-Type': 'application/json'}), + label='search', + ) + ] + ) +``` + +### 6. Finally, free API limitations + +The free version of the API has significant limitations: + +- Limited set of available endpoints +- Autocompletes function only works for company searches +- Not all data fields are accessible +- Limited search filtering capabilities + +Consider a paid subscription for production-level work. The API provides the most reliable way to access Crunchbase data, even with its rate constraints. + +## What’s your best path forward? + +We've explored three different approaches to obtaining data from Crunchbase: + +1. **Sitemap** - for large-scale data collection +2. **Search** - difficult to automate due to Cloudflare protection +3. **Official API** - the most reliable solution for commercial projects + +Each method has its advantages, but for most projects, I recommend using the official API despite its limitations in the free version. + +The complete source code is available in my [repository](https://github.com/Mantisus/crunchbase-crawlee). Have questions or want to discuss implementation details? Join our [Discord](https://discord.com/invite/jyEM2PRvMU) - our community of developers is there to help. 
diff --git a/website/blog/2025/01-10/img/import_crawlers.webp b/website/blog/2025/01-10/img/import_crawlers.webp new file mode 100644 index 000000000000..adfc64c26842 Binary files /dev/null and b/website/blog/2025/01-10/img/import_crawlers.webp differ diff --git a/website/blog/2025/01-10/index.md b/website/blog/2025/01-10/index.md new file mode 100644 index 000000000000..b839e8988f13 --- /dev/null +++ b/website/blog/2025/01-10/index.md @@ -0,0 +1,268 @@ +--- +slug: crawlee-for-python-v05 +title: Crawlee for Python v0.5 +description: Announcing the Crawlee for Python v0.5 release. +authors: [VladaD] +--- + +Crawlee for Python v0.5 is now available! This is our biggest release to date, bringing new ported functionality from the [Crawlee for JavaScript](https://github.com/apify/crawlee), brand-new features that are exclusive to the Python library (for now), a new consolidated package structure, and a bunch of bug fixes and further improvements. + + + +## Getting started + +You can upgrade to the latest version straight from [PyPI](https://pypi.org/project/crawlee/): + +```shell +pip install --upgrade crawlee +``` + +Check out the full changelog on our [website](https://www.crawlee.dev/python/docs/changelog#050-2025-01-02) to see all the details. If you are updating from an older version, make sure to follow our [Upgrading to v0.5](https://www.crawlee.dev/python/docs/upgrading/upgrading-to-v0x#upgrading-to-v05) guide for a smooth upgrade. + +## New package structure + +We have introduced a new consolidated package structure. The goal is to streamline the development experience, help you find the crawlers you are looking for faster, and improve the IDE's code suggestions while importing. + +### Crawlers + +We have grouped all crawler classes (and their corresponding crawling context classes) into a single sub-package called `crawlers`. Here is a quick example of how the imports have changed: + +```diff +- from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext ++ from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +``` + +Look how you can see all the crawlers that we have, isn't that cool! + +![Import from crawlers subpackage.](./img/import_crawlers.webp) + +### Storage clients + +Similarly, we have moved all storage client classes under `storage_clients` sub-package. For instance: + +```diff +- from crawlee.memory_storage_client import MemoryStorageClient ++ from crawlee.storage_clients import MemoryStorageClient +``` + +This consolidation makes it clearer where each class belongs and ensures that your IDE can provide better autocompletion when you are looking for the right crawler or storage client. + +## Continued parity with Crawlee JS + +We are constantly working toward feature parity with our JavaScript library, [Crawlee JS](https://github.com/apify/crawlee). With v0.5, we have brought over more functionality: + +### HTML to text context helper + +The `html_to_text` crawling context helper simplifies extracting text from an HTML page by automatically removing all tags and returning only the raw text content. It's available in the [`ParselCrawlingContext`](https://www.crawlee.dev/python/api/class/ParselCrawlingContext#html_to_text) and [`BeautifulSoupCrawlingContext`](https://www.crawlee.dev/python/api/class/BeautifulSoupCrawlingContext#html_to_text). 
+ +```python +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + crawler = ParselCrawler() + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info('Crawling: %s', context.request.url) + text = context.html_to_text() + # Continue with the processing... + + await crawler.run(['https://crawlee.dev']) + + +if __name__ == '__main__': + asyncio.run(main()) +``` + +In this example, we use a [`ParselCrawler`](https://www.crawlee.dev/python/api/class/ParselCrawler) to fetch a webpage, then invoke `context.html_to_text()` to extract clean text for further processing. + +### Use state + +The [`use_state`](https://www.crawlee.dev/python/api/class/UseStateFunction) crawling context helper makes it simple to create and manage persistent state values within your crawler. It ensures that all state values are automatically persisted. It enables you to maintain data across different crawler runs, restarts, and failures. It acts as a convenient abstraction for interaction with [`KeyValueStore`](https://www.crawlee.dev/python/api/class/KeyValueStore). + +```python +import asyncio + +from crawlee import Request +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + # Create a crawler with purge_on_start disabled to retain state across runs. + crawler = ParselCrawler( + configuration=Configuration(purge_on_start=False), + ) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Crawling {context.request.url}') + + # Retrieve or initialize the state with a default value. + state = await context.use_state('state', default_value={'runs': 0}) + + # Increment the run count. + state['runs'] += 1 + + # Create a request with always_enqueue enabled to bypass deduplication and ensure it is processed. + request = Request.from_url('https://crawlee.dev/', always_enqueue=True) + + # Run the crawler with the start request. + await crawler.run([request]) + + # Fetch the persisted state from the key-value store. + kvs = await crawler.get_key_value_store() + state = await kvs.get_auto_saved_value('state') + crawler.log.info(f'Final state after run: {state}') + + +if __name__ == '__main__': + asyncio.run(main()) +``` + +Please note that the `use_state` is an experimental feature. Its behavior and interface may evolve in future versions. + +## Brand new features + +In addition to porting features from JS, we are introducing new, Python-first functionalities that will eventually make their way into Crawlee JS in the coming months. + +### Crawler's stop method + +The [`BasicCrawler`](https://www.crawlee.dev/python/api/class/BasicCrawler), and by extension, all crawlers that inherit from it, now has a [`stop`](https://www.crawlee.dev/python/api/class/BasicCrawler#stop) method. This makes it easy to halt the crawling when a specific condition is met, for instance, if you have found the data you were looking for. + +```python +import asyncio + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +async def main() -> None: + crawler = ParselCrawler() + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + context.log.info('Crawling: %s', context.request.url) + + # Extract and enqueue links from the page. 
+        await context.enqueue_links()
+
+        title = context.selector.css('title::text').get()
+
+        # Condition when you want to stop the crawler, e.g. you
+        # have found what you were looking for.
+        if title and 'Crawlee for Python' in title:
+            context.log.info('Condition met, stopping the crawler.')
+            await crawler.stop()
+
+    await crawler.run(['https://crawlee.dev'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```
+
+### Request loaders
+
+The new [`RequestLoader`](https://www.crawlee.dev/python/api/class/RequestLoader), [`RequestManager`](https://www.crawlee.dev/python/api/class/RequestManager) and [`RequestManagerTandem`](https://www.crawlee.dev/python/api/class/RequestManagerTandem) classes manage how Crawlee accesses and stores requests. They let you plug in any external component (service) as a source of requests and, optionally, combine it with Crawlee's standard [`RequestQueue`](https://www.crawlee.dev/python/api/class/RequestQueue).
+
+You can learn more about these new features in the [Request loaders guide](https://www.crawlee.dev/python/docs/guides/request-loaders).
+
+```python
+import asyncio
+
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.request_loaders import RequestList, RequestManagerTandem
+from crawlee.storages import RequestQueue
+
+
+async def main() -> None:
+    rl = RequestList(
+        [
+            'https://crawlee.dev',
+            'https://apify.com',
+            # Long list of URLs...
+        ],
+    )
+
+    rq = await RequestQueue.open()
+
+    # Combine them into a single request source.
+    tandem = RequestManagerTandem(rl, rq)
+
+    crawler = ParselCrawler(request_manager=tandem)
+
+    @crawler.router.default_handler
+    async def handler(context: ParselCrawlingContext) -> None:
+        context.log.info(f'Crawling {context.request.url}')
+        # ...
+
+    await crawler.run()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
+```
+
+In this example, we combine a [`RequestList`](https://www.crawlee.dev/python/api/class/RequestList) with a [`RequestQueue`](https://www.crawlee.dev/python/api/class/RequestQueue). However, instead of the `RequestList`, you can use any other class that implements the [`RequestLoader`](https://www.crawlee.dev/python/api/class/RequestLoader) interface to suit your specific requirements.
+
+### Service locator
+
+The [`ServiceLocator`](https://www.crawlee.dev/python/api/class/ServiceLocator) is primarily an internal mechanism for managing the services that Crawlee depends on, specifically the [`Configuration`](https://www.crawlee.dev/python/api/class/ServiceLocator), [`StorageClient`](https://www.crawlee.dev/python/api/class/ServiceLocator), and [`EventManager`](https://www.crawlee.dev/python/api/class/ServiceLocator). By swapping out these components, you can adapt Crawlee to different runtime environments.
+
+You can use the service locator explicitly:
+
+```python
+import asyncio
+
+from crawlee import service_locator
+from crawlee.configuration import Configuration
+from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
+from crawlee.events import LocalEventManager
+from crawlee.storage_clients import MemoryStorageClient
+
+
+async def main() -> None:
+    service_locator.set_configuration(Configuration())
+    service_locator.set_storage_client(MemoryStorageClient())
+    service_locator.set_event_manager(LocalEventManager())
+
+    crawler = ParselCrawler()
+
+    # ...
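+    # Since no services are passed to ParselCrawler directly, the crawler created
+    # above picks up the configuration, storage client, and event manager that were
+    # registered in the service_locator earlier.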
+ + +if __name__ == '__main__': + asyncio.run(main()) +``` + +Or pass the services directly to the crawler instance, and they will be set under the hood: + +```python +import asyncio + +from crawlee.configuration import Configuration +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext +from crawlee.events import LocalEventManager +from crawlee.storage_clients import MemoryStorageClient + + +async def main() -> None: + crawler = ParselCrawler( + configuration=Configuration(), + storage_client=MemoryStorageClient(), + event_manager=LocalEventManager(), + ) + + # ... + + +if __name__ == '__main__': + asyncio.run(main()) +``` + +## Conclusion + +We are excited to share that Crawlee v0.5 is here. If you have any questions or feedback, please open a [GitHub discussion](https://github.com/apify/crawlee-python/discussions). If you encounter any bugs, or have an idea for a new feature, please open a [GitHub issue](https://github.com/apify/crawlee-python/issues). diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 9cb5502a7bb0..ea38d6caa64a 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -40,6 +40,7 @@ LukasP: image_url: ./img/lukasp.webp socials: github: Patai5 + MatejV: name: Matěj Volf title: Web Automation Engineer @@ -47,6 +48,7 @@ MatejV: image_url: https://avatars.githubusercontent.com/u/31281386?v=4 socials: github: mvolfik + SatyamT: name: Satyam Tripathi title: Community Member of Crawlee @@ -54,4 +56,11 @@ SatyamT: image_url: https://avatars.githubusercontent.com/u/69134468?v=4 socials: github: triposat - \ No newline at end of file + +VladaD: + name: Vlada Dusek + title: Developer of Crawlee for Python + url: https://github.com/vdusek + image_url: https://avatars.githubusercontent.com/u/25082181?v=4 + socials: + github: vdusek diff --git a/website/versioned_docs/version-3.12/guides/proxy_management.mdx b/website/versioned_docs/version-3.12/guides/proxy_management.mdx index 50ef9b949713..8bf385f1c5b5 100644 --- a/website/versioned_docs/version-3.12/guides/proxy_management.mdx +++ b/website/versioned_docs/version-3.12/guides/proxy_management.mdx @@ -61,7 +61,72 @@ Examples of how to use our proxy URLs with crawlers are shown below in [Crawler All our proxy needs are managed by the `ProxyConfiguration` class. We create an instance using the `ProxyConfiguration` `constructor` function based on the provided options. See the `ProxyConfigurationOptions` for all the possible constructor options. -### Crawler integration +### Static proxy list + +You can provide a static list of proxy URLs to the `proxyUrls` option. The `ProxyConfiguration` will then rotate through the provided proxies. + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: [ + 'http://proxy-1.com', + 'http://proxy-2.com', + null // null means no proxy is used + ] +}); +``` + +This is the simplest way to use a list of proxies. Crawlee will rotate through the list of proxies in a round-robin fashion. + +### Custom proxy function + +The `ProxyConfiguration` class allows you to provide a custom function to pick a proxy URL. This is useful when you want to implement your own logic for selecting a proxy. 
+ +```javascript +const proxyConfiguration = new ProxyConfiguration({ + newUrlFunction: (sessionId, { request }) => { + if (request?.url.includes('crawlee.dev')) { + return null; // for crawlee.dev, we don't use a proxy + } + + return 'http://proxy-1.com'; // for all other URLs, we use this proxy + } +}); +``` + +The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns a string containing the proxy URL. + +The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. + +The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. + +### Tiered proxies + +You can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website. + +:::warning + +Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)). + +Using this configuration through the `newUrl` calls will not yield the expected results. + +::: + +```javascript +const proxyConfiguration = new ProxyConfiguration({ + tieredProxyUrls: [ + [null], // At first, we try to connect without a proxy + ['http://okay-proxy.com'], + ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], + ['http://very-good-and-expensive-proxy.com'], + ] +}); +``` + +This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. + +Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. + +## Crawler integration `ProxyConfiguration` integrates seamlessly into `HttpCrawler`, `CheerioCrawler`, `JSDOMCrawler`, `PlaywrightCrawler` and `PuppeteerCrawler`. @@ -95,7 +160,7 @@ All our proxy needs are managed by the `proxyConfiguration.newUrl()` allows us to pass a `sessionId` parameter. It will then be used to create a `sessionId`-`proxyUrl` pair, and subsequent `newUrl()` calls with the same `sessionId` will always return the same `proxyUrl`. This is extremely useful in scraping, because we want to create the impression of a real user. See the [session management guide](../guides/session-management) and `SessionPool` class for more information on how keeping a real session helps us avoid blocking. 
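To illustrate the `sessionId`-`proxyUrl` pairing described above, here is a minimal sketch (assuming a plain `proxyUrls` list and an ES module context with top-level `await`); it requests proxy URLs for two different sessions:

```javascript
import { ProxyConfiguration } from 'crawlee';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [
        'http://proxy-1.com',
        'http://proxy-2.com',
    ],
});

// Calls with the same sessionId keep returning the proxy URL that was
// first paired with that session.
const first = await proxyConfiguration.newUrl('session-a');
const second = await proxyConfiguration.newUrl('session-a');
console.log(first === second); // true - the session keeps its proxy

// A different sessionId may be paired with another proxy from the list.
const other = await proxyConfiguration.newUrl('session-b');
console.log(other);
```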
diff --git a/website/yarn.lock b/website/yarn.lock index 46cfcbb992af..7ce0e445dbc2 100644 --- a/website/yarn.lock +++ b/website/yarn.lock @@ -311,10 +311,10 @@ __metadata: languageName: node linkType: hard -"@babel/compat-data@npm:^7.22.6, @babel/compat-data@npm:^7.25.9, @babel/compat-data@npm:^7.26.0": - version: 7.26.3 - resolution: "@babel/compat-data@npm:7.26.3" - checksum: 10c0/d63e71845c34dfad8d7ff8c15b562e620dbf60e68e3abfa35681d24d612594e8e5ec9790d831a287ecd79ce00f48e7ffddc85c5ce94af7242d45917b9c1a5f90 +"@babel/compat-data@npm:^7.22.6, @babel/compat-data@npm:^7.26.0, @babel/compat-data@npm:^7.26.5": + version: 7.26.5 + resolution: "@babel/compat-data@npm:7.26.5" + checksum: 10c0/9d2b41f0948c3dfc5de44d9f789d2208c2ea1fd7eb896dfbb297fe955e696728d6f363c600cd211e7f58ccbc2d834fe516bb1e4cf883bbabed8a32b038afc1a0 languageName: node linkType: hard @@ -341,16 +341,16 @@ __metadata: languageName: node linkType: hard -"@babel/generator@npm:^7.25.9, @babel/generator@npm:^7.26.0, @babel/generator@npm:^7.26.3": - version: 7.26.3 - resolution: "@babel/generator@npm:7.26.3" +"@babel/generator@npm:^7.25.9, @babel/generator@npm:^7.26.0, @babel/generator@npm:^7.26.5": + version: 7.26.5 + resolution: "@babel/generator@npm:7.26.5" dependencies: - "@babel/parser": "npm:^7.26.3" - "@babel/types": "npm:^7.26.3" + "@babel/parser": "npm:^7.26.5" + "@babel/types": "npm:^7.26.5" "@jridgewell/gen-mapping": "npm:^0.3.5" "@jridgewell/trace-mapping": "npm:^0.3.25" jsesc: "npm:^3.0.2" - checksum: 10c0/54f260558e3e4ec8942da3cde607c35349bb983c3a7c5121243f96893fba3e8cd62e1f1773b2051f936f8c8a10987b758d5c7d76dbf2784e95bb63ab4843fa00 + checksum: 10c0/3be79e0aa03f38858a465d12ee2e468320b9122dc44fc85984713e32f16f4d77ce34a16a1a9505972782590e0b8d847b6f373621f9c6fafa1906d90f31416cb0 languageName: node linkType: hard @@ -364,15 +364,15 @@ __metadata: linkType: hard "@babel/helper-compilation-targets@npm:^7.22.6, @babel/helper-compilation-targets@npm:^7.25.9": - version: 7.25.9 - resolution: "@babel/helper-compilation-targets@npm:7.25.9" + version: 7.26.5 + resolution: "@babel/helper-compilation-targets@npm:7.26.5" dependencies: - "@babel/compat-data": "npm:^7.25.9" + "@babel/compat-data": "npm:^7.26.5" "@babel/helper-validator-option": "npm:^7.25.9" browserslist: "npm:^4.24.0" lru-cache: "npm:^5.1.1" semver: "npm:^6.3.1" - checksum: 10c0/a6b26a1e4222e69ef8e62ee19374308f060b007828bc11c65025ecc9e814aba21ff2175d6d3f8bf53c863edd728ee8f94ba7870f8f90a37d39552ad9933a8aaa + checksum: 10c0/9da5c77e5722f1a2fcb3e893049a01d414124522bbf51323bb1a0c9dcd326f15279836450fc36f83c9e8a846f3c40e88be032ed939c5a9840922bed6073edfb4 languageName: node linkType: hard @@ -463,10 +463,10 @@ __metadata: languageName: node linkType: hard -"@babel/helper-plugin-utils@npm:^7.0.0, @babel/helper-plugin-utils@npm:^7.18.6, @babel/helper-plugin-utils@npm:^7.22.5, @babel/helper-plugin-utils@npm:^7.25.9, @babel/helper-plugin-utils@npm:^7.8.0": - version: 7.25.9 - resolution: "@babel/helper-plugin-utils@npm:7.25.9" - checksum: 10c0/483066a1ba36ff16c0116cd24f93de05de746a603a777cd695ac7a1b034928a65a4ecb35f255761ca56626435d7abdb73219eba196f9aa83b6c3c3169325599d +"@babel/helper-plugin-utils@npm:^7.0.0, @babel/helper-plugin-utils@npm:^7.18.6, @babel/helper-plugin-utils@npm:^7.22.5, @babel/helper-plugin-utils@npm:^7.25.9, @babel/helper-plugin-utils@npm:^7.26.5, @babel/helper-plugin-utils@npm:^7.8.0": + version: 7.26.5 + resolution: "@babel/helper-plugin-utils@npm:7.26.5" + checksum: 
10c0/cdaba71d4b891aa6a8dfbe5bac2f94effb13e5fa4c2c487667fdbaa04eae059b78b28d85a885071f45f7205aeb56d16759e1bed9c118b94b16e4720ef1ab0f65 languageName: node linkType: hard @@ -484,15 +484,15 @@ __metadata: linkType: hard "@babel/helper-replace-supers@npm:^7.25.9": - version: 7.25.9 - resolution: "@babel/helper-replace-supers@npm:7.25.9" + version: 7.26.5 + resolution: "@babel/helper-replace-supers@npm:7.26.5" dependencies: "@babel/helper-member-expression-to-functions": "npm:^7.25.9" "@babel/helper-optimise-call-expression": "npm:^7.25.9" - "@babel/traverse": "npm:^7.25.9" + "@babel/traverse": "npm:^7.26.5" peerDependencies: "@babel/core": ^7.0.0 - checksum: 10c0/0b40d7d2925bd3ba4223b3519e2e4d2456d471ad69aa458f1c1d1783c80b522c61f8237d3a52afc9e47c7174129bbba650df06393a6787d5722f2ec7f223c3f4 + checksum: 10c0/b19b1245caf835207aaaaac3a494f03a16069ae55e76a2e1350b5acd560e6a820026997a8160e8ebab82ae873e8208759aa008eb8422a67a775df41f0a4633d4 languageName: node linkType: hard @@ -548,14 +548,14 @@ __metadata: languageName: node linkType: hard -"@babel/parser@npm:^7.25.9, @babel/parser@npm:^7.26.0, @babel/parser@npm:^7.26.3": - version: 7.26.3 - resolution: "@babel/parser@npm:7.26.3" +"@babel/parser@npm:^7.25.9, @babel/parser@npm:^7.26.0, @babel/parser@npm:^7.26.5": + version: 7.26.5 + resolution: "@babel/parser@npm:7.26.5" dependencies: - "@babel/types": "npm:^7.26.3" + "@babel/types": "npm:^7.26.5" bin: parser: ./bin/babel-parser.js - checksum: 10c0/48f736374e61cfd10ddbf7b80678514ae1f16d0e88bc793d2b505d73d9b987ea786fc8c2f7ee8f8b8c467df062030eb07fd0eb2168f0f541ca1f542775852cad + checksum: 10c0/2e77dd99ee028ee3c10fa03517ae1169f2432751adf71315e4dc0d90b61639d51760d622f418f6ac665ae4ea65f8485232a112ea0e76f18e5900225d3d19a61e languageName: node linkType: hard @@ -732,13 +732,13 @@ __metadata: linkType: hard "@babel/plugin-transform-block-scoped-functions@npm:^7.25.9": - version: 7.25.9 - resolution: "@babel/plugin-transform-block-scoped-functions@npm:7.25.9" + version: 7.26.5 + resolution: "@babel/plugin-transform-block-scoped-functions@npm:7.26.5" dependencies: - "@babel/helper-plugin-utils": "npm:^7.25.9" + "@babel/helper-plugin-utils": "npm:^7.26.5" peerDependencies: "@babel/core": ^7.0.0-0 - checksum: 10c0/e92ba0e3d72c038513844d8fca1cc8437dcb35cd42778e97fd03cb8303380b201468611e7ecfdcae3de33473b2679fe2de1552c5f925d112c5693425cf851f10 + checksum: 10c0/2f3060800ead46b09971dd7bf830d66383b7bc61ced9945633b4ef9bf87787956ea83fcf49b387cecb377812588c6b81681714c760f9cf89ecba45edcbab1192 languageName: node linkType: hard @@ -1027,13 +1027,13 @@ __metadata: linkType: hard "@babel/plugin-transform-nullish-coalescing-operator@npm:^7.25.9": - version: 7.25.9 - resolution: "@babel/plugin-transform-nullish-coalescing-operator@npm:7.25.9" + version: 7.26.5 + resolution: "@babel/plugin-transform-nullish-coalescing-operator@npm:7.26.5" dependencies: - "@babel/helper-plugin-utils": "npm:^7.25.9" + "@babel/helper-plugin-utils": "npm:^7.26.5" peerDependencies: "@babel/core": ^7.0.0-0 - checksum: 10c0/eb623db5be078a1c974afe7c7797b0309ba2ea9e9237c0b6831ade0f56d8248bb4ab3432ab34495ff8c877ec2fe412ff779d1e9b3c2b8139da18e1753d950bc3 + checksum: 10c0/2e4b84745f9e8c40caf3e611641de4d6c7da6f96c2925b7fe568e3b031ed1864e325b9dffc9cda4e442fc40be43ffabb088782e980d411e0562bd5222df547ec languageName: node linkType: hard @@ -1311,17 +1311,17 @@ __metadata: linkType: hard "@babel/plugin-transform-typescript@npm:^7.25.9": - version: 7.26.3 - resolution: "@babel/plugin-transform-typescript@npm:7.26.3" + version: 7.26.5 + resolution: 
"@babel/plugin-transform-typescript@npm:7.26.5" dependencies: "@babel/helper-annotate-as-pure": "npm:^7.25.9" "@babel/helper-create-class-features-plugin": "npm:^7.25.9" - "@babel/helper-plugin-utils": "npm:^7.25.9" + "@babel/helper-plugin-utils": "npm:^7.26.5" "@babel/helper-skip-transparent-expression-wrappers": "npm:^7.25.9" "@babel/plugin-syntax-typescript": "npm:^7.25.9" peerDependencies: "@babel/core": ^7.0.0-0 - checksum: 10c0/0a0509ec56666fab5b557d573254665956a377916fc1e7cee309c0711d11257338ba7ee678db03603a3985d2c6c0b210b788fb6b9616d8fc0595469e39089a8f + checksum: 10c0/64204b1f1c77d896142071cc174e7bb4fbc597bdc0ea73aec8de1a72d252755db381b9ed40342fc283c32387d7375d0803d0aed8262dd503633f5035148d47a0 languageName: node linkType: hard @@ -1525,28 +1525,28 @@ __metadata: languageName: node linkType: hard -"@babel/traverse@npm:^7.25.9": - version: 7.26.4 - resolution: "@babel/traverse@npm:7.26.4" +"@babel/traverse@npm:^7.25.9, @babel/traverse@npm:^7.26.5": + version: 7.26.5 + resolution: "@babel/traverse@npm:7.26.5" dependencies: "@babel/code-frame": "npm:^7.26.2" - "@babel/generator": "npm:^7.26.3" - "@babel/parser": "npm:^7.26.3" + "@babel/generator": "npm:^7.26.5" + "@babel/parser": "npm:^7.26.5" "@babel/template": "npm:^7.25.9" - "@babel/types": "npm:^7.26.3" + "@babel/types": "npm:^7.26.5" debug: "npm:^4.3.1" globals: "npm:^11.1.0" - checksum: 10c0/cf25d0eda9505daa0f0832ad786b9e28c9d967e823aaf7fbe425250ab198c656085495aa6bed678b27929e095c84eea9fd778b851a31803da94c9bc4bf4eaef7 + checksum: 10c0/0779059ecf63e31446564cf31adf170e701e8017ef02c819c57924a9a83d6b2ce41dbff3ef295589da9410497a3e575655bb8084ca470e0ab1bc193128afa9fe languageName: node linkType: hard -"@babel/types@npm:^7.21.3, @babel/types@npm:^7.25.9, @babel/types@npm:^7.26.0, @babel/types@npm:^7.26.3, @babel/types@npm:^7.4.4": - version: 7.26.3 - resolution: "@babel/types@npm:7.26.3" +"@babel/types@npm:^7.21.3, @babel/types@npm:^7.25.9, @babel/types@npm:^7.26.0, @babel/types@npm:^7.26.5, @babel/types@npm:^7.4.4": + version: 7.26.5 + resolution: "@babel/types@npm:7.26.5" dependencies: "@babel/helper-string-parser": "npm:^7.25.9" "@babel/helper-validator-identifier": "npm:^7.25.9" - checksum: 10c0/966c5242c5e55c8704bf7a7418e7be2703a0afa4d19a8480999d5a4ef13d095dd60686615fe5983cb7593b4b06ba3a7de8d6ca501c1d78bdd233a10d90be787b + checksum: 10c0/0278053b69d7c2b8573aa36dc5242cad95f0d965e1c0ed21ccacac6330092e59ba5949753448f6d6eccf6ad59baaef270295cc05218352e060ea8c68388638c4 languageName: node linkType: hard @@ -2895,9 +2895,9 @@ __metadata: linkType: hard "@lit-labs/ssr-dom-shim@npm:^1.2.0": - version: 1.2.1 - resolution: "@lit-labs/ssr-dom-shim@npm:1.2.1" - checksum: 10c0/75cecf2cc4c1a089c6984d9f45b8264e3b4947b4ebed96aef7eb201bd6b3f26caeaafedf457884ac38d4f2d99cddaf94a4b2414c02c61fbf1f64c0a0dade11f4 + version: 1.3.0 + resolution: "@lit-labs/ssr-dom-shim@npm:1.3.0" + checksum: 10c0/743a9b295ef2f186712f08883da553c9990be291409615309c99aa4946cfe440a184e4213c790c24505c80beb86b9cfecf10b5fb30ce17c83698f8424f48678d languageName: node linkType: hard @@ -3508,90 +3508,90 @@ __metadata: languageName: node linkType: hard -"@swc/core-darwin-arm64@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-darwin-arm64@npm:1.10.6" +"@swc/core-darwin-arm64@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-darwin-arm64@npm:1.10.7" conditions: os=darwin & cpu=arm64 languageName: node linkType: hard -"@swc/core-darwin-x64@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-darwin-x64@npm:1.10.6" +"@swc/core-darwin-x64@npm:1.10.7": + version: 1.10.7 + 
resolution: "@swc/core-darwin-x64@npm:1.10.7" conditions: os=darwin & cpu=x64 languageName: node linkType: hard -"@swc/core-linux-arm-gnueabihf@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-linux-arm-gnueabihf@npm:1.10.6" +"@swc/core-linux-arm-gnueabihf@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-linux-arm-gnueabihf@npm:1.10.7" conditions: os=linux & cpu=arm languageName: node linkType: hard -"@swc/core-linux-arm64-gnu@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-linux-arm64-gnu@npm:1.10.6" +"@swc/core-linux-arm64-gnu@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-linux-arm64-gnu@npm:1.10.7" conditions: os=linux & cpu=arm64 & libc=glibc languageName: node linkType: hard -"@swc/core-linux-arm64-musl@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-linux-arm64-musl@npm:1.10.6" +"@swc/core-linux-arm64-musl@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-linux-arm64-musl@npm:1.10.7" conditions: os=linux & cpu=arm64 & libc=musl languageName: node linkType: hard -"@swc/core-linux-x64-gnu@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-linux-x64-gnu@npm:1.10.6" +"@swc/core-linux-x64-gnu@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-linux-x64-gnu@npm:1.10.7" conditions: os=linux & cpu=x64 & libc=glibc languageName: node linkType: hard -"@swc/core-linux-x64-musl@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-linux-x64-musl@npm:1.10.6" +"@swc/core-linux-x64-musl@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-linux-x64-musl@npm:1.10.7" conditions: os=linux & cpu=x64 & libc=musl languageName: node linkType: hard -"@swc/core-win32-arm64-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-win32-arm64-msvc@npm:1.10.6" +"@swc/core-win32-arm64-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-win32-arm64-msvc@npm:1.10.7" conditions: os=win32 & cpu=arm64 languageName: node linkType: hard -"@swc/core-win32-ia32-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-win32-ia32-msvc@npm:1.10.6" +"@swc/core-win32-ia32-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-win32-ia32-msvc@npm:1.10.7" conditions: os=win32 & cpu=ia32 languageName: node linkType: hard -"@swc/core-win32-x64-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/core-win32-x64-msvc@npm:1.10.6" +"@swc/core-win32-x64-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/core-win32-x64-msvc@npm:1.10.7" conditions: os=win32 & cpu=x64 languageName: node linkType: hard "@swc/core@npm:^1.7.39": - version: 1.10.6 - resolution: "@swc/core@npm:1.10.6" - dependencies: - "@swc/core-darwin-arm64": "npm:1.10.6" - "@swc/core-darwin-x64": "npm:1.10.6" - "@swc/core-linux-arm-gnueabihf": "npm:1.10.6" - "@swc/core-linux-arm64-gnu": "npm:1.10.6" - "@swc/core-linux-arm64-musl": "npm:1.10.6" - "@swc/core-linux-x64-gnu": "npm:1.10.6" - "@swc/core-linux-x64-musl": "npm:1.10.6" - "@swc/core-win32-arm64-msvc": "npm:1.10.6" - "@swc/core-win32-ia32-msvc": "npm:1.10.6" - "@swc/core-win32-x64-msvc": "npm:1.10.6" + version: 1.10.7 + resolution: "@swc/core@npm:1.10.7" + dependencies: + "@swc/core-darwin-arm64": "npm:1.10.7" + "@swc/core-darwin-x64": "npm:1.10.7" + "@swc/core-linux-arm-gnueabihf": "npm:1.10.7" + "@swc/core-linux-arm64-gnu": "npm:1.10.7" + "@swc/core-linux-arm64-musl": "npm:1.10.7" + "@swc/core-linux-x64-gnu": "npm:1.10.7" + "@swc/core-linux-x64-musl": "npm:1.10.7" + "@swc/core-win32-arm64-msvc": "npm:1.10.7" + "@swc/core-win32-ia32-msvc": "npm:1.10.7" + "@swc/core-win32-x64-msvc": "npm:1.10.7" "@swc/counter": "npm:^0.1.3" 
"@swc/types": "npm:^0.1.17" peerDependencies: @@ -3620,7 +3620,7 @@ __metadata: peerDependenciesMeta: "@swc/helpers": optional: true - checksum: 10c0/40cd7c29b454feecb7f9b8010a28d5650c4435ce15b26c9467fb650fee9cef35f88f16a22c30aafbf294c9e1588ebb55ced07acaaac93b5b52349070c810c930 + checksum: 10c0/73d3b164620590aff57512125e3cfd6dc1bb3346882fa9ad12abf8029f8be01eb71e6afc3c760c3e2cb479a2d7ff3180bf298f907768b93e3eac15fc72e0d855 languageName: node linkType: hard @@ -3631,91 +3631,91 @@ __metadata: languageName: node linkType: hard -"@swc/html-darwin-arm64@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-darwin-arm64@npm:1.10.6" +"@swc/html-darwin-arm64@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-darwin-arm64@npm:1.10.7" conditions: os=darwin & cpu=arm64 languageName: node linkType: hard -"@swc/html-darwin-x64@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-darwin-x64@npm:1.10.6" +"@swc/html-darwin-x64@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-darwin-x64@npm:1.10.7" conditions: os=darwin & cpu=x64 languageName: node linkType: hard -"@swc/html-linux-arm-gnueabihf@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-linux-arm-gnueabihf@npm:1.10.6" +"@swc/html-linux-arm-gnueabihf@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-linux-arm-gnueabihf@npm:1.10.7" conditions: os=linux & cpu=arm languageName: node linkType: hard -"@swc/html-linux-arm64-gnu@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-linux-arm64-gnu@npm:1.10.6" +"@swc/html-linux-arm64-gnu@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-linux-arm64-gnu@npm:1.10.7" conditions: os=linux & cpu=arm64 & libc=glibc languageName: node linkType: hard -"@swc/html-linux-arm64-musl@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-linux-arm64-musl@npm:1.10.6" +"@swc/html-linux-arm64-musl@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-linux-arm64-musl@npm:1.10.7" conditions: os=linux & cpu=arm64 & libc=musl languageName: node linkType: hard -"@swc/html-linux-x64-gnu@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-linux-x64-gnu@npm:1.10.6" +"@swc/html-linux-x64-gnu@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-linux-x64-gnu@npm:1.10.7" conditions: os=linux & cpu=x64 & libc=glibc languageName: node linkType: hard -"@swc/html-linux-x64-musl@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-linux-x64-musl@npm:1.10.6" +"@swc/html-linux-x64-musl@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-linux-x64-musl@npm:1.10.7" conditions: os=linux & cpu=x64 & libc=musl languageName: node linkType: hard -"@swc/html-win32-arm64-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-win32-arm64-msvc@npm:1.10.6" +"@swc/html-win32-arm64-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-win32-arm64-msvc@npm:1.10.7" conditions: os=win32 & cpu=arm64 languageName: node linkType: hard -"@swc/html-win32-ia32-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-win32-ia32-msvc@npm:1.10.6" +"@swc/html-win32-ia32-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-win32-ia32-msvc@npm:1.10.7" conditions: os=win32 & cpu=ia32 languageName: node linkType: hard -"@swc/html-win32-x64-msvc@npm:1.10.6": - version: 1.10.6 - resolution: "@swc/html-win32-x64-msvc@npm:1.10.6" +"@swc/html-win32-x64-msvc@npm:1.10.7": + version: 1.10.7 + resolution: "@swc/html-win32-x64-msvc@npm:1.10.7" conditions: os=win32 & cpu=x64 languageName: node linkType: hard "@swc/html@npm:^1.7.39": - version: 1.10.6 - resolution: "@swc/html@npm:1.10.6" + version: 1.10.7 
+ resolution: "@swc/html@npm:1.10.7" dependencies: "@swc/counter": "npm:^0.1.3" - "@swc/html-darwin-arm64": "npm:1.10.6" - "@swc/html-darwin-x64": "npm:1.10.6" - "@swc/html-linux-arm-gnueabihf": "npm:1.10.6" - "@swc/html-linux-arm64-gnu": "npm:1.10.6" - "@swc/html-linux-arm64-musl": "npm:1.10.6" - "@swc/html-linux-x64-gnu": "npm:1.10.6" - "@swc/html-linux-x64-musl": "npm:1.10.6" - "@swc/html-win32-arm64-msvc": "npm:1.10.6" - "@swc/html-win32-ia32-msvc": "npm:1.10.6" - "@swc/html-win32-x64-msvc": "npm:1.10.6" + "@swc/html-darwin-arm64": "npm:1.10.7" + "@swc/html-darwin-x64": "npm:1.10.7" + "@swc/html-linux-arm-gnueabihf": "npm:1.10.7" + "@swc/html-linux-arm64-gnu": "npm:1.10.7" + "@swc/html-linux-arm64-musl": "npm:1.10.7" + "@swc/html-linux-x64-gnu": "npm:1.10.7" + "@swc/html-linux-x64-musl": "npm:1.10.7" + "@swc/html-win32-arm64-msvc": "npm:1.10.7" + "@swc/html-win32-ia32-msvc": "npm:1.10.7" + "@swc/html-win32-x64-msvc": "npm:1.10.7" dependenciesMeta: "@swc/html-darwin-arm64": optional: true @@ -3737,7 +3737,7 @@ __metadata: optional: true "@swc/html-win32-x64-msvc": optional: true - checksum: 10c0/bfba8ae1afc10059e4185f537e2fdb8f03ba2dca332766e56ab19588795597b3da35641f6e9d0de07673e6cd8e43086c289a747834e6842d256e8a03e3d49380 + checksum: 10c0/7095cbb1be62221dd66431b6a51309ac94c329aa2b52b2ef1384647a589213e685e5c6e9e0c5a5ad6b0701720e36c18a33ee4273c08c4b54f455254ced12792e languageName: node linkType: hard @@ -8203,8 +8203,8 @@ __metadata: linkType: hard "glob@npm:^11.0.0": - version: 11.0.0 - resolution: "glob@npm:11.0.0" + version: 11.0.1 + resolution: "glob@npm:11.0.1" dependencies: foreground-child: "npm:^3.1.0" jackspeak: "npm:^4.0.1" @@ -8214,7 +8214,7 @@ __metadata: path-scurry: "npm:^2.0.0" bin: glob: dist/esm/bin.mjs - checksum: 10c0/419866015d8795258a8ac51de5b9d1a99c72634fc3ead93338e4da388e89773ab21681e494eac0fbc4250b003451ca3110bb4f1c9393d15d14466270094fdb4e + checksum: 10c0/2b32588be52e9e90f914c7d8dec32f3144b81b84054b0f70e9adfebf37cd7014570489f2a79d21f7801b9a4bd4cca94f426966bfd00fb64a5b705cfe10da3a03 languageName: node linkType: hard diff --git a/yarn.lock b/yarn.lock index e50e3015a9cc..c64f1f7852c1 100644 --- a/yarn.lock +++ b/yarn.lock @@ -169,23 +169,23 @@ __metadata: linkType: hard "@babel/parser@npm:^7.25.4": - version: 7.26.3 - resolution: "@babel/parser@npm:7.26.3" + version: 7.26.5 + resolution: "@babel/parser@npm:7.26.5" dependencies: - "@babel/types": "npm:^7.26.3" + "@babel/types": "npm:^7.26.5" bin: parser: ./bin/babel-parser.js - checksum: 10c0/48f736374e61cfd10ddbf7b80678514ae1f16d0e88bc793d2b505d73d9b987ea786fc8c2f7ee8f8b8c467df062030eb07fd0eb2168f0f541ca1f542775852cad + checksum: 10c0/2e77dd99ee028ee3c10fa03517ae1169f2432751adf71315e4dc0d90b61639d51760d622f418f6ac665ae4ea65f8485232a112ea0e76f18e5900225d3d19a61e languageName: node linkType: hard -"@babel/types@npm:^7.25.4, @babel/types@npm:^7.26.3": - version: 7.26.3 - resolution: "@babel/types@npm:7.26.3" +"@babel/types@npm:^7.25.4, @babel/types@npm:^7.26.5": + version: 7.26.5 + resolution: "@babel/types@npm:7.26.5" dependencies: "@babel/helper-string-parser": "npm:^7.25.9" "@babel/helper-validator-identifier": "npm:^7.25.9" - checksum: 10c0/966c5242c5e55c8704bf7a7418e7be2703a0afa4d19a8480999d5a4ef13d095dd60686615fe5983cb7593b4b06ba3a7de8d6ca501c1d78bdd233a10d90be787b + checksum: 10c0/0278053b69d7c2b8573aa36dc5242cad95f0d965e1c0ed21ccacac6330092e59ba5949753448f6d6eccf6ad59baaef270295cc05218352e060ea8c68388638c4 languageName: node linkType: hard @@ -771,7 +771,7 @@ __metadata: cross-env: "npm:^7.0.3" deep-equal: 
"npm:^2.0.5" eslint: "npm:^8.57.1" - eslint-config-prettier: "npm:^9.1.0" + eslint-config-prettier: "npm:^10.0.0" express: "npm:^4.18.1" fs-extra: "npm:^11.0.0" gen-esm-wrapper: "npm:^1.1.3" @@ -785,7 +785,7 @@ __metadata: playwright: "npm:1.49.1" portastic: "npm:^1.0.1" proxy: "npm:^1.0.2" - puppeteer: "npm:23.11.1" + puppeteer: "npm:24.1.0" rimraf: "npm:^6.0.0" tsx: "npm:^4.4.0" turbo: "npm:^2.1.0" @@ -2031,9 +2031,9 @@ __metadata: languageName: node linkType: hard -"@puppeteer/browsers@npm:2.6.1": - version: 2.6.1 - resolution: "@puppeteer/browsers@npm:2.6.1" +"@puppeteer/browsers@npm:2.7.0": + version: 2.7.0 + resolution: "@puppeteer/browsers@npm:2.7.0" dependencies: debug: "npm:^4.4.0" extract-zip: "npm:^2.0.1" @@ -2045,7 +2045,7 @@ __metadata: yargs: "npm:^17.7.2" bin: browsers: lib/cjs/main-cli.js - checksum: 10c0/31d4951eec40515769467be3878d3581fe0e50227f2a9fa865e9f872e4a003262996c412a1d48d9c800665b3aa91bb1c2d971eaa314ef10e536d08e63f2f40d3 + checksum: 10c0/1b71c89337d04603621a4d19a0e66277453a1e8f41410d777e162ee02e83b0882b7595869c1351bf14ef6fb7d435faac798aa5239fa714296da7faf4ec1f1452 languageName: node linkType: hard @@ -3188,8 +3188,8 @@ __metadata: linkType: hard "apify-client@npm:^2.9.0": - version: 2.11.0 - resolution: "apify-client@npm:2.11.0" + version: 2.11.1 + resolution: "apify-client@npm:2.11.1" dependencies: "@apify/consts": "npm:^2.25.0" "@apify/log": "npm:^2.2.6" @@ -3201,7 +3201,7 @@ __metadata: ow: "npm:^0.28.2" tslib: "npm:^2.5.0" type-fest: "npm:^4.0.0" - checksum: 10c0/d02ea6e781f13a18a621c24fe2057a972df6d3351460e928a45e5273885c42793a3b86b44d5f7029d6aa3c21e3adc5339435d7ec859144962e55bd43930adb09 + checksum: 10c0/6f490c054f67b46b33e663874f4cfd592f4ed071d9c3164ff15008c5851faf9b3451ac73376ee9c4fbb7fe47f293dfc137494b2aad45e7b4a01d1307f3078109 languageName: node linkType: hard @@ -3527,9 +3527,9 @@ __metadata: linkType: hard "bare-events@npm:^2.0.0, bare-events@npm:^2.2.0": - version: 2.5.3 - resolution: "bare-events@npm:2.5.3" - checksum: 10c0/fc78e068cd1c7e75ab027121b69f104e315af122f10263734a1f3a7c5a8e2e5934d9a46638f5c9eafadf84d64c01fd87cd3169da4f7f8046df29a17fb1c532f5 + version: 2.5.4 + resolution: "bare-events@npm:2.5.4" + checksum: 10c0/877a9cea73d545e2588cdbd6fd01653e27dac48ad6b44985cdbae73e1f57f292d4ba52e25d1fba53674c1053c463d159f3d5c7bc36a2e6e192e389b499ddd627 languageName: node linkType: hard @@ -4029,6 +4029,18 @@ __metadata: languageName: node linkType: hard +"chromium-bidi@npm:0.12.0": + version: 0.12.0 + resolution: "chromium-bidi@npm:0.12.0" + dependencies: + mitt: "npm:3.0.1" + zod: "npm:3.24.1" + peerDependencies: + devtools-protocol: "*" + checksum: 10c0/dbddf97e9c829922078dc40e069c7ba5d5949c0902dde624000299027b2ecb4fb905068c5f3e67be619d5ff1906795629da676aa3ae1ac53adf719893d757f4f + languageName: node + linkType: hard + "ci-info@npm:^3.1.0, ci-info@npm:^3.2.0": version: 3.9.0 resolution: "ci-info@npm:3.9.0" @@ -4871,16 +4883,16 @@ __metadata: linkType: hard "devtools-protocol@npm:*": - version: 0.0.1403989 - resolution: "devtools-protocol@npm:0.0.1403989" - checksum: 10c0/312b0e55e4bd66170c5502edbea0ca866fceaf69c13d72d4e92d4d7d353550d766a6edbc37355d8d03c56e36939f454680f7e8f445ebbdb62d1b6e7ebc57610b + version: 0.0.1404580 + resolution: "devtools-protocol@npm:0.0.1404580" + checksum: 10c0/207133db0cf186efe71bac3d25e9f70270e62dd9488e38159c115f3edac4fb156bdd954af294e7fa015f316b8c236d15dca3d051fa27de609a3de4bc0d1dc80f languageName: node linkType: hard -"devtools-protocol@npm:0.0.1367902": - version: 0.0.1367902 - resolution: 
"devtools-protocol@npm:0.0.1367902" - checksum: 10c0/be4017f2bfd04474d718daca0e88e062f4afceb2f311662d717f4eae5bda3473da748a68ff1bf2326a67ce35c37af33932190fe8ef1d36c8ef22576befdc57c4 +"devtools-protocol@npm:0.0.1380148": + version: 0.0.1380148 + resolution: "devtools-protocol@npm:0.0.1380148" + checksum: 10c0/489cb7af6890e19a2815fabcbc5178a9d3e510e25680a0eb28567d76a7664e8db228ea221079ef7e5a9619e37951cf71191df641b86b7c47d8ed4f1aa7edb4db languageName: node linkType: hard @@ -5570,14 +5582,14 @@ __metadata: languageName: node linkType: hard -"eslint-config-prettier@npm:^9.1.0": - version: 9.1.0 - resolution: "eslint-config-prettier@npm:9.1.0" +"eslint-config-prettier@npm:^10.0.0": + version: 10.0.1 + resolution: "eslint-config-prettier@npm:10.0.1" peerDependencies: eslint: ">=7.0.0" bin: - eslint-config-prettier: bin/cli.js - checksum: 10c0/6d332694b36bc9ac6fdb18d3ca2f6ac42afa2ad61f0493e89226950a7091e38981b66bac2b47ba39d15b73fff2cd32c78b850a9cf9eed9ca9a96bfb2f3a2f10d + eslint-config-prettier: build/bin/cli.js + checksum: 10c0/e2434931669d211663c0493f2c1640a670a02ba4503a68f056a7eda133f383acbbb983a4a7bd0ad6cb3b2bc4d5731c3be8b32fe28e35087a76fea45f7061ae70 languageName: node linkType: hard @@ -6707,8 +6719,8 @@ __metadata: linkType: hard "glob@npm:^11.0.0": - version: 11.0.0 - resolution: "glob@npm:11.0.0" + version: 11.0.1 + resolution: "glob@npm:11.0.1" dependencies: foreground-child: "npm:^3.1.0" jackspeak: "npm:^4.0.1" @@ -6718,7 +6730,7 @@ __metadata: path-scurry: "npm:^2.0.0" bin: glob: dist/esm/bin.mjs - checksum: 10c0/419866015d8795258a8ac51de5b9d1a99c72634fc3ead93338e4da388e89773ab21681e494eac0fbc4250b003451ca3110bb4f1c9393d15d14466270094fdb4e + checksum: 10c0/2b32588be52e9e90f914c7d8dec32f3144b81b84054b0f70e9adfebf37cd7014570489f2a79d21f7801b9a4bd4cca94f426966bfd00fb64a5b705cfe10da3a03 languageName: node linkType: hard @@ -10670,33 +10682,33 @@ __metadata: languageName: node linkType: hard -"puppeteer-core@npm:23.11.1": - version: 23.11.1 - resolution: "puppeteer-core@npm:23.11.1" +"puppeteer-core@npm:24.1.0": + version: 24.1.0 + resolution: "puppeteer-core@npm:24.1.0" dependencies: - "@puppeteer/browsers": "npm:2.6.1" + "@puppeteer/browsers": "npm:2.7.0" chromium-bidi: "npm:0.11.0" debug: "npm:^4.4.0" - devtools-protocol: "npm:0.0.1367902" + devtools-protocol: "npm:0.0.1380148" typed-query-selector: "npm:^2.12.0" ws: "npm:^8.18.0" - checksum: 10c0/6512a3dca8c7bea620219332b84c4442754fead6c5021c26ea395ddc2f84610a54accf185ba1450e02885cb063c2d12f96eb5f18e7e1b6795f3e32a4b8a2102e + checksum: 10c0/3ff0f4b6f1b86a8de973cd6a3e4d2db562062b88ef4d565ba986e3114c04e7cf61d16ecc90b0e2db29ae5adeb7bb260dfaac614d4e35be1c0d1432061edaebb7 languageName: node linkType: hard -"puppeteer@npm:23.11.1": - version: 23.11.1 - resolution: "puppeteer@npm:23.11.1" +"puppeteer@npm:24.1.0": + version: 24.1.0 + resolution: "puppeteer@npm:24.1.0" dependencies: - "@puppeteer/browsers": "npm:2.6.1" - chromium-bidi: "npm:0.11.0" + "@puppeteer/browsers": "npm:2.7.0" + chromium-bidi: "npm:0.12.0" cosmiconfig: "npm:^9.0.0" - devtools-protocol: "npm:0.0.1367902" - puppeteer-core: "npm:23.11.1" + devtools-protocol: "npm:0.0.1380148" + puppeteer-core: "npm:24.1.0" typed-query-selector: "npm:^2.12.0" bin: puppeteer: lib/cjs/puppeteer/node/cli.js - checksum: 10c0/e967f5ce02ab9e0343eb4403f32ab7de8a6dbeffe6b23be8725e112015ae4a60264a554742cf10302434795a8e9ea27ec9b048126fee23750ce24c3b238d2ebc + checksum: 10c0/b2c84610fdd0ea5cf0f1f0b53d24c34e87093e9012616fad9ffa96f447a874be48d68ad9804a344016d30cd18fddfeb67fd429d7a52c44aa445e06b4f93e2417 
languageName: node linkType: hard @@ -13469,3 +13481,10 @@ __metadata: checksum: 10c0/8f14c87d6b1b53c944c25ce7a28616896319d95bc46a9660fe441adc0ed0a81253b02b5abdaeffedbeb23bdd25a0bf1c29d2c12dd919aef6447652dd295e3e69 languageName: node linkType: hard + +"zod@npm:3.24.1": + version: 3.24.1 + resolution: "zod@npm:3.24.1" + checksum: 10c0/0223d21dbaa15d8928fe0da3b54696391d8e3e1e2d0283a1a070b5980a1dbba945ce631c2d1eccc088fdbad0f2dfa40155590bf83732d3ac4fcca2cc9237591b + languageName: node + linkType: hard