You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I am running into a weird issue while trying to transfer the login state by sharing the user data. I first perform the login using the on_browser_created hook, then share that state with a second AsyncWebCrawler. However, I still have to perform the login again for the second AsyncWebCrawler. Here's my code.
async def on_browser_created_hook(cls, browser):
    """Hook fired once the browser exists: let the user log in manually,
    then export the authenticated session to my_storage_state.json.

    Args:
        browser: the Playwright browser instance supplied by the crawler.
    """
    logger.info("[HOOK] on_browser_created")
    context = browser.contexts[0]
    page = await context.new_page()
    # BUG FIX: the original never navigated anywhere and never waited for the
    # user, so "networkidle" fired on the blank page and storage_state() saved
    # an UNauthenticated session -- which is why the second crawler still
    # asked for a login. Navigate to the login page first, then block until
    # the user confirms the login is complete (same flow as the working
    # sync-Playwright version).
    await page.goto("https://auth-url.com/")  # TODO: use your real auth_url
    print("Please log in manually in the browser.")
    # A bare input() would freeze the event loop; run it in a worker thread.
    import asyncio
    await asyncio.to_thread(input, "Press Enter here once the login is done...")
    await page.wait_for_load_state("networkidle")
    # Export the storage state (cookies, localStorage) after manual login.
    await context.storage_state(path="my_storage_state.json")
    await page.close()
# First run: perform the interactive login and persist the session state.
async with AsyncWebCrawler(
    headless=False,   # headed so the user can complete the SSO form by hand
    verbose=True,
    # Runs once the browser process exists; performs the manual login and
    # writes my_storage_state.json.
    hooks={"on_browser_created": cls.on_browser_created_hook},
    use_persistent_context=True,      # keep the profile (cookies etc.) on disk
    user_data_dir="./my_user_data",   # profile dir shared with the second run
) as crawler:
    result = await crawler.arun(
        url=auth_url,                 # SSO entry point
        cache_mode=CacheMode.BYPASS,  # never serve the login page from cache
    )
    if result.success:
        print("SSO login success", result.success)
# Second run: reuse both the persistent profile and the exported storage
# state, so no interactive login should be required.
async with AsyncWebCrawler(
    verbose=True,
    headless=True,                    # no UI needed once authenticated
    use_persistent_context=True,      # reopen the same on-disk profile
    text_only=True,
    light_mode=True,
    user_data_dir="./my_user_data",   # same dir the first crawler wrote to
    # Cookies/localStorage exported by the login hook.
    # NOTE(review): whether storage_state is honored when a persistent
    # context is also in use is crawler-version dependent -- confirm.
    storage_state="my_storage_state.json",
) as crawler:
    scraper = Scraper(
        crawler=crawler,
        kwargs=kwargs,
        urls=urls,
        workers=workers,
        limit=page_limit,    # cap on pages per site
        max_depth=depth,     # crawl depth limit
    )
    await scraper.run()
logger.info(f"Crawled {len(scraper.results)} pages across all websites:")
When I try the same thing using Playwright directly, I am able to share the user data without having to log in again.
Here's the playwright code
def authenticate_and_save_state():
    """Open a headed browser, let the user complete the SSO login, and save
    the session state (cookies, local storage, etc.) to auth_state.json."""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)  # Open headed browser for SSO
        context = browser.new_context()
        page = context.new_page()
        # BUG FIX: the URL was 'https:/auth-url.com/' -- the scheme was
        # missing a slash, so navigation never reached the auth host.
        page.goto('https://auth-url.com/')
        # Perform SSO login manually or automatically
        input("Please complete the SSO login in the browser and press Enter here...")
        # Save the session state (cookies, local storage, etc.)
        context.storage_state(path='auth_state.json')
        context.close()  # flush the context cleanly before closing the browser
        browser.close()
        print("Authentication state saved to auth_state.json")
def crawl_and_print_page():
    """Reopen a headless browser with the previously saved auth state and
    print the HTML of the protected page."""
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        # Use the state from the mounted file
        ctx = chromium.new_context(storage_state='auth_state.json')
        tab = ctx.new_page()
        # Navigate to the protected page you want to crawl
        tab.goto('https://my-protected-page/')
        tab.wait_for_load_state('networkidle')
        print(tab.content())
        # tab.screenshot(path='protected_page_screenshot.png')
        chromium.close()
The text was updated successfully, but these errors were encountered:
I am running into a weird issue while trying to transfer the login state by sharing the user data. I first perform the login using the on_browser_created hook, then share that state with a second AsyncWebCrawler. However, I still have to perform the login again for the second AsyncWebCrawler. Here's my code.
When I try the same thing using Playwright directly, I am able to share the user data without having to log in again.
Here's the playwright code
The text was updated successfully, but these errors were encountered: