From d6be9951ae2ca31cd4332108de955f1bfff0a1ba Mon Sep 17 00:00:00 2001 From: Alec Smrekar Date: Tue, 12 Dec 2023 16:37:51 +0100 Subject: [PATCH] Decode the HTML before loading static assets --- Cargo.toml | 1 + src/lib.rs | 5 +++-- tests/parse.rs | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 tests/parse.rs diff --git a/Cargo.toml b/Cargo.toml index dd779cf..7f371c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ license = "Apache-2.0" [dependencies] goose = { version = "0.17", default-features = false } +html-escape = "0.2" http = "0.2" log = "0.4" rand = "0.8" diff --git a/src/lib.rs b/src/lib.rs index c33c7a8..263783c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ //! * `default`: use the native TLS implementation for `goose` and `reqwest` //! * `rustls-tls`: use the TLS implemenation provided by `rustls` +use std::borrow::Cow; use goose::goose::GooseResponse; use goose::prelude::*; use http::Uri; @@ -993,7 +994,7 @@ pub async fn get_src_elements(user: &mut GooseUser, html: &str) -> Vec { // @TODO: parse HTML5 srcset= also let src_elements = Regex::new(r#"(?i)src="(.*?)""#).unwrap(); let mut elements: Vec = Vec::new(); - for url in src_elements.captures_iter(html) { + for url in src_elements.captures_iter(html_escape::decode_html_entities(html).as_ref()) { if valid_local_uri(user, &url[1]) { elements.push(url[1].to_string()); } @@ -1010,7 +1011,7 @@ pub async fn get_css_elements(user: &mut GooseUser, html: &str) -> Vec { // is the URL to local css assets. let css = Regex::new(r#"(?i)href="(.*?\.css.*?)""#).unwrap(); let mut elements: Vec = Vec::new(); - for url in css.captures_iter(html) { + for url in css.captures_iter(html_escape::decode_html_entities(html).as_ref()) { if valid_local_uri(user, &url[1]) { elements.push(url[1].to_string()); } diff --git a/tests/parse.rs b/tests/parse.rs new file mode 100644 index 0000000..535f473 --- /dev/null +++ b/tests/parse.rs @@ -0,0 +1,53 @@ +use gumdrop::Options; +use httpmock::{Method::GET, MockServer}; + +use goose::config::GooseConfiguration; +use goose::goose::get_base_url; +use goose::metrics::GooseCoordinatedOmissionMitigation::Disabled; +use goose::prelude::*; +use goose_eggs::load_static_elements; + +#[tokio::test] +// Loads static elements and checks that characters are decoded properly. +async fn test_html_decoding() { + let html: &str = r#" + + + + + + + Title 1234ABCD + + +

Test text on the page.

+ + "#; + + let server = MockServer::start(); + + let mock_endpoint1 = server.mock(|when, then| { + when.method(GET) + .path("/test1.js") + .query_param("foo", "1") + .query_param("bar", "2"); + then.status(200).body("test"); + }); + let mock_endpoint2 = server.mock(|when, then| { + when.method(GET) + .path("/test2.js") + .query_param("foo", "1") + .query_param("bar", "2"); + then.status(200).body("test"); + }); + + let config: Vec<&str> = vec![]; + let mut configuration = GooseConfiguration::parse_args_default(&config).unwrap(); + configuration.co_mitigation = Some(Disabled); + let base_url = get_base_url(Some(server.base_url()), None, None).unwrap(); + let mut user = GooseUser::new(0, "".to_string(), base_url, &configuration, 0, None).unwrap(); + + load_static_elements(&mut user, html).await; + assert_eq!(mock_endpoint1.hits(), 1); + assert_eq!(mock_endpoint2.hits(), 1); +}