diff --git a/Cargo.toml b/Cargo.toml index 91d2dec4..ef7d9746 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,8 @@ tokio = { version = "1", features = [ tracing = "0.1" pin-project-lite = "0.2" dunce = "1" +bytes = { version = "1.4.0", features = ["serde"], optional = true } +reqwest = { version = "0.11.20", features = [ "brotli", "gzip", "deflate", "native-tls-alpn", "stream" ] } [target.'cfg(windows)'.dependencies] winreg = "0.50" @@ -54,10 +56,11 @@ tracing-subscriber = "0.3" tokio = { version = "1", features = ["rt-multi-thread", "time", "macros"] } [features] -default = ["async-std-runtime"] +default = ["async-std-runtime", "bytes"] async-std-runtime = ["async-std", "async-tungstenite/async-std-runtime"] tokio-runtime = ["tokio", "async-tungstenite/tokio-runtime"] fetcher = [] +bytes = ["dep:bytes"] # Temporary features until cargo weak dependencies bug is fixed # See https://github.com/rust-lang/cargo/issues/10801 diff --git a/examples/block-navigation.rs b/examples/block-navigation.rs index f5b41119..ee9009ee 100644 --- a/examples/block-navigation.rs +++ b/examples/block-navigation.rs @@ -103,7 +103,7 @@ async fn main() -> Result<(), Box> { // Navigate to target page.goto("http://google.com").await?; let content = page.content().await?; - println!("Content: {content}"); + println!("Content: {:?}", content); browser.close().await?; browser_handle.await; diff --git a/src/browser.rs b/src/browser.rs index a896b79a..994f2fdd 100644 --- a/src/browser.rs +++ b/src/browser.rs @@ -51,33 +51,82 @@ pub struct Browser { browser_context: BrowserContext, } -impl Browser { - /// Connect to an already running chromium instance via websocket - pub async fn connect(debug_ws_url: impl Into) -> Result<(Self, Handler)> { - let debug_ws_url = debug_ws_url.into(); - let conn = Connection::::connect(&debug_ws_url).await?; - - let (tx, rx) = channel(1); - - let fut = Handler::new(conn, rx, HandlerConfig::default()); - let browser_context = fut.default_browser_context().clone(); +/// Browser connection information. +#[derive(serde::Deserialize, Debug, Default)] +pub struct BrowserConnection { + #[serde(rename = "Browser")] + /// The browser name + pub browser: String, + #[serde(rename = "Protocol-Version")] + /// Browser version + pub protocol_version: String, + #[serde(rename = "User-Agent")] + /// User Agent used by default. + pub user_agent: String, + #[serde(rename = "V8-Version")] + /// The v8 engine version + pub v8_version: String, + #[serde(rename = "WebKit-Version")] + /// Webkit version + pub webkit_version: String, + #[serde(rename = "webSocketDebuggerUrl")] + /// Remote debugging address + pub web_socket_debugger_url: String, +} - let browser = Self { - sender: tx, - config: None, - child: None, - debug_ws_url, - browser_context, - }; - Ok((browser, fut)) +impl Browser { + /// Connect to an already running chromium instance via the given URL. + /// + /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint. + pub async fn connect(url: impl Into) -> Result<(Self, Handler)> { + Self::connect_with_config(url, HandlerConfig::default()).await } - // Connect to an already running chromium instance via websocket with HandlerConfig + // Connect to an already running chromium instance with a given `HandlerConfig`. + /// + /// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint. pub async fn connect_with_config( - debug_ws_url: impl Into, + url: impl Into, config: HandlerConfig, ) -> Result<(Self, Handler)> { - let debug_ws_url = debug_ws_url.into(); + let mut debug_ws_url = url.into(); + + if debug_ws_url.starts_with("http") { + match reqwest::Client::new() + .get( + if debug_ws_url.ends_with("/json/version") + || debug_ws_url.ends_with("/json/version/") + { + debug_ws_url.clone() + } else { + format!( + "{}{}json/version", + &debug_ws_url, + if debug_ws_url.ends_with('/') { "" } else { "/" } + ) + }, + ) + .header("content-type", "application/json") + .send() + .await + { + Ok(req) => { + let socketaddr = req.remote_addr().unwrap(); + let connection: BrowserConnection = + serde_json::from_slice(&req.bytes().await.unwrap_or_default()) + .unwrap_or_default(); + + if !connection.web_socket_debugger_url.is_empty() { + // prevent proxy interfaces from returning local ips to connect to the exact machine + debug_ws_url = connection + .web_socket_debugger_url + .replace("127.0.0.1", &socketaddr.ip().to_string()); + } + } + Err(_) => return Err(CdpError::NoResponse), + } + } + let conn = Connection::::connect(&debug_ws_url).await?; let (tx, rx) = channel(1); diff --git a/src/detection.rs b/src/detection.rs index 5e27b35e..6642ad07 100644 --- a/src/detection.rs +++ b/src/detection.rs @@ -89,7 +89,12 @@ fn get_by_name(options: &DetectionOptions) -> Option { #[allow(unused_variables)] fn get_by_path(options: &DetectionOptions) -> Option { #[cfg(all(unix, not(target_os = "macos")))] - let default_paths: [(&str, bool); 0] = []; + let default_paths: [(&str, bool); 3] = [ + ("/opt/chromium.org/chromium", true), + ("/opt/google/chrome", true), + // test for lambda + ("/tmp/aws/lib", true), + ]; #[cfg(windows)] let default_paths = [( r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe", diff --git a/src/listeners.rs b/src/listeners.rs index 6dcd3e39..ac482d48 100644 --- a/src/listeners.rs +++ b/src/listeners.rs @@ -26,7 +26,7 @@ impl EventListeners { method, kind, } = req; - let subs = self.listeners.entry(method).or_insert_with(Vec::new); + let subs = self.listeners.entry(method).or_default(); subs.push(EventListener { listener, kind, diff --git a/src/page.rs b/src/page.rs index 1d9b6d5b..86c8823b 100644 --- a/src/page.rs +++ b/src/page.rs @@ -43,20 +43,40 @@ pub struct Page { } impl Page { - /// Changes your user_agent, removes the `navigator.webdriver` property + /// Removes the `navigator.webdriver` property /// changes permissions, pluggins rendering contexts and the `window.chrome` /// property to make it harder to detect the scraper as a bot - pub async fn enable_stealth_mode(&self) -> Result<()> { + async fn _enable_stealth_mode(&self) -> Result<()> { self.hide_webdriver().await?; self.hide_permissions().await?; self.hide_plugins().await?; self.hide_webgl_vendor().await?; self.hide_chrome().await?; + + Ok(()) + } + + /// Changes your user_agent, removes the `navigator.webdriver` property + /// changes permissions, pluggins rendering contexts and the `window.chrome` + /// property to make it harder to detect the scraper as a bot + pub async fn enable_stealth_mode(&self) -> Result<()> { + self._enable_stealth_mode().await?; self.set_user_agent("Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5296.0 Safari/537.36").await?; Ok(()) } + /// Changes your user_agent with a custom agent, removes the `navigator.webdriver` property + /// changes permissions, pluggins rendering contexts and the `window.chrome` + /// property to make it harder to detect the scraper as a bot + pub async fn enable_stealth_mode_with_agent(&self, ua: &str) -> Result<()> { + self._enable_stealth_mode().await?; + if !ua.is_empty() { + self.set_user_agent(ua).await?; + } + Ok(()) + } + /// Sets `window.chrome` on frame creation async fn hide_chrome(&self) -> Result<(), CdpError> { self.execute(AddScriptToEvaluateOnNewDocumentParams { @@ -1088,6 +1108,27 @@ impl Page { .into_value()?) } + #[cfg(feature = "bytes")] + /// Returns the HTML content of the page + pub async fn content_bytes(&self) -> Result { + Ok(self + .evaluate( + "{ + let retVal = ''; + if (document.doctype) { + retVal = new XMLSerializer().serializeToString(document.doctype); + } + if (document.documentElement) { + retVal += document.documentElement.outerHTML; + } + retVal + } + ", + ) + .await? + .into_value()?) + } + /// Returns source for the script with given id. /// /// Debugger must be enabled.