Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto find websocket address and handle bytes #188

Merged
merged 17 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ tokio = { version = "1", features = [
tracing = "0.1"
pin-project-lite = "0.2"
dunce = "1"
bytes = { version = "1.4.0", features = ["serde"], optional = true }
reqwest = { version = "0.11.20", features = [ "brotli", "gzip", "deflate", "native-tls-alpn", "stream" ] }

[target.'cfg(windows)'.dependencies]
winreg = "0.50"
Expand All @@ -54,10 +56,11 @@ tracing-subscriber = "0.3"
tokio = { version = "1", features = ["rt-multi-thread", "time", "macros"] }

[features]
default = ["async-std-runtime"]
default = ["async-std-runtime", "bytes"]
async-std-runtime = ["async-std", "async-tungstenite/async-std-runtime"]
tokio-runtime = ["tokio", "async-tungstenite/tokio-runtime"]
fetcher = []
bytes = ["dep:bytes"]

# Temporary features until cargo weak dependencies bug is fixed
# See https://github.com/rust-lang/cargo/issues/10801
Expand Down
2 changes: 1 addition & 1 deletion examples/block-navigation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Navigate to target
page.goto("http://google.com").await?;
let content = page.content().await?;
println!("Content: {content}");
println!("Content: {:?}", content);

browser.close().await?;
browser_handle.await;
Expand Down
91 changes: 70 additions & 21 deletions src/browser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,33 +51,82 @@ pub struct Browser {
browser_context: BrowserContext,
}

impl Browser {
/// Connect to an already running chromium instance via websocket
pub async fn connect(debug_ws_url: impl Into<String>) -> Result<(Self, Handler)> {
let debug_ws_url = debug_ws_url.into();
let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;

let (tx, rx) = channel(1);

let fut = Handler::new(conn, rx, HandlerConfig::default());
let browser_context = fut.default_browser_context().clone();
/// Browser connection information.
///
/// This is the shape `connect_with_config` deserializes from the body returned by
/// the browser's `json/version` endpoint. Each field is read from the JSON key
/// named in its `rename` attribute.
///
/// The `Default` implementation yields empty strings for every field;
/// `connect_with_config` falls back to it when the response body cannot be
/// parsed and then ignores the empty `web_socket_debugger_url`.
#[derive(serde::Deserialize, Debug, Default)]
pub struct BrowserConnection {
#[serde(rename = "Browser")]
/// The browser name, read from the `Browser` key.
pub browser: String,
#[serde(rename = "Protocol-Version")]
/// The protocol version, read from the `Protocol-Version` key.
pub protocol_version: String,
#[serde(rename = "User-Agent")]
/// The user agent used by default, read from the `User-Agent` key.
pub user_agent: String,
#[serde(rename = "V8-Version")]
/// The V8 engine version, read from the `V8-Version` key.
pub v8_version: String,
#[serde(rename = "WebKit-Version")]
/// The WebKit version, read from the `WebKit-Version` key.
pub webkit_version: String,
#[serde(rename = "webSocketDebuggerUrl")]
/// The remote debugging WebSocket address, read from the `webSocketDebuggerUrl` key.
///
/// When non-empty, `connect_with_config` uses this value as the URL to connect to.
pub web_socket_debugger_url: String,
}

let browser = Self {
sender: tx,
config: None,
child: None,
debug_ws_url,
browser_context,
};
Ok((browser, fut))
impl Browser {
/// Connects to an already running chromium instance at `url`, using the
/// default `HandlerConfig`.
///
/// This is a convenience wrapper around `connect_with_config`. If the URL is a
/// http(s) URL, the Websocket URL is first looked up via the `json/version`
/// endpoint.
pub async fn connect(url: impl Into<String>) -> Result<(Self, Handler)> {
    let config = HandlerConfig::default();
    Self::connect_with_config(url, config).await
}

// Connect to an already running chromium instance via websocket with HandlerConfig
/// Connect to an already running chromium instance with a given `HandlerConfig`.
///
/// If the URL is a http(s) URL, it will first attempt to retrieve the Websocket URL from the `json/version` endpoint.
pub async fn connect_with_config(
debug_ws_url: impl Into<String>,
url: impl Into<String>,
config: HandlerConfig,
) -> Result<(Self, Handler)> {
let debug_ws_url = debug_ws_url.into();
let mut debug_ws_url = url.into();

if debug_ws_url.starts_with("http") {
match reqwest::Client::new()
.get(
if debug_ws_url.ends_with("/json/version")
|| debug_ws_url.ends_with("/json/version/")
{
debug_ws_url.clone()
} else {
format!(
"{}{}json/version",
&debug_ws_url,
if debug_ws_url.ends_with('/') { "" } else { "/" }
)
},
)
.header("content-type", "application/json")
.send()
.await
{
Ok(req) => {
let socketaddr = req.remote_addr().unwrap();
let connection: BrowserConnection =
serde_json::from_slice(&req.bytes().await.unwrap_or_default())
.unwrap_or_default();

if !connection.web_socket_debugger_url.is_empty() {
// prevent proxy interfaces from returning local ips to connect to the exact machine
debug_ws_url = connection
.web_socket_debugger_url
.replace("127.0.0.1", &socketaddr.ip().to_string());
}
}
Err(_) => return Err(CdpError::NoResponse),
}
}

let conn = Connection::<CdpEventMessage>::connect(&debug_ws_url).await?;

let (tx, rx) = channel(1);
Expand Down
7 changes: 6 additions & 1 deletion src/detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,12 @@ fn get_by_name(options: &DetectionOptions) -> Option<PathBuf> {
#[allow(unused_variables)]
fn get_by_path(options: &DetectionOptions) -> Option<PathBuf> {
#[cfg(all(unix, not(target_os = "macos")))]
let default_paths: [(&str, bool); 0] = [];
let default_paths: [(&str, bool); 3] = [
("/opt/chromium.org/chromium", true),
("/opt/google/chrome", true),
// test for lambda
("/tmp/aws/lib", true),
];
#[cfg(windows)]
let default_paths = [(
r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
Expand Down
2 changes: 1 addition & 1 deletion src/listeners.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ impl EventListeners {
method,
kind,
} = req;
let subs = self.listeners.entry(method).or_insert_with(Vec::new);
let subs = self.listeners.entry(method).or_default();
subs.push(EventListener {
listener,
kind,
Expand Down
45 changes: 43 additions & 2 deletions src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,40 @@ pub struct Page {
}

impl Page {
/// Changes your user_agent, removes the `navigator.webdriver` property
/// Removes the `navigator.webdriver` property
/// changes permissions, plugins, rendering contexts and the `window.chrome`
/// property to make it harder to detect the scraper as a bot
pub async fn enable_stealth_mode(&self) -> Result<()> {
async fn _enable_stealth_mode(&self) -> Result<()> {
self.hide_webdriver().await?;
self.hide_permissions().await?;
self.hide_plugins().await?;
self.hide_webgl_vendor().await?;
self.hide_chrome().await?;

Ok(())
}

/// Enables stealth mode and overrides the user agent with a built-in default.
///
/// Runs `_enable_stealth_mode` (which hides `navigator.webdriver`, permissions,
/// plugins, the WebGL vendor and `window.chrome`) and then sets a fixed
/// user agent string, to make it harder to detect the scraper as a bot.
///
/// # Errors
///
/// Returns the first error produced by the stealth patches or by
/// `set_user_agent`.
pub async fn enable_stealth_mode(&self) -> Result<()> {
    // User agent applied once the stealth patches are in place.
    let default_agent = "Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.5296.0 Safari/537.36";

    self._enable_stealth_mode().await?;
    self.set_user_agent(default_agent).await?;

    Ok(())
}

/// Enables stealth mode and, optionally, overrides the user agent with `ua`.
///
/// Runs `_enable_stealth_mode` (which hides `navigator.webdriver`, permissions,
/// plugins, the WebGL vendor and `window.chrome`) to make it harder to detect
/// the scraper as a bot. When `ua` is empty the user agent is left untouched;
/// otherwise it is set to `ua`.
///
/// # Errors
///
/// Returns the first error produced by the stealth patches or by
/// `set_user_agent`.
pub async fn enable_stealth_mode_with_agent(&self, ua: &str) -> Result<()> {
    self._enable_stealth_mode().await?;

    // Nothing to override: keep whatever user agent is currently in effect.
    if ua.is_empty() {
        return Ok(());
    }

    self.set_user_agent(ua).await?;
    Ok(())
}

/// Sets `window.chrome` on frame creation
async fn hide_chrome(&self) -> Result<(), CdpError> {
self.execute(AddScriptToEvaluateOnNewDocumentParams {
Expand Down Expand Up @@ -1088,6 +1108,27 @@ impl Page {
.into_value()?)
}

#[cfg(feature = "bytes")]
/// Returns the HTML content of the page as `bytes::Bytes`.
///
/// Only compiled when the `bytes` feature is enabled.
///
/// The evaluated script starts from an empty string, appends the serialized
/// `document.doctype` (via `XMLSerializer`) when one exists, then appends
/// `document.documentElement.outerHTML` when a document element exists. The
/// script's resulting value is converted with `into_value`.
///
/// # Errors
///
/// Propagates any error returned by `evaluate` or by `into_value`.
pub async fn content_bytes(&self) -> Result<bytes::Bytes> {
Ok(self
.evaluate(
"{
let retVal = '';
if (document.doctype) {
retVal = new XMLSerializer().serializeToString(document.doctype);
}
if (document.documentElement) {
retVal += document.documentElement.outerHTML;
}
retVal
}
",
)
.await?
.into_value()?)
}

/// Returns source for the script with given id.
///
/// Debugger must be enabled.
Expand Down
Loading