Skip to content

Commit

Permalink
Introduce --root-dir (#1576)
Browse files Browse the repository at this point in the history
* windows

* Introduce --root-path

* lint

* lint

* Simplification

* Add unit tests

* Add integration test

* Sync docs

* Add missing comment to make CI happy

* Revert one of the Windows-specific changes because it was causing a test failure

* Support both options at the same time

* Revert a comment change that is no longer applicable

* Remove unused code

* Fix and simplification

* Integration test both at the same time

* Unit tests both at the same time

* Remove now redundant comment

* Revert Windows-specific change; it seems unnecessary after recent changes

* Use Collector::default()

* extract method and unit tests

* clippy

* clippy: &Option<A> -> Option<&A>

* Remove outdated comment

* Rename --root-path to --root-dir

* Restrict --root-dir to absolute paths for now

* Move root dir check
  • Loading branch information
trask authored Dec 13, 2024
1 parent 685b653 commit 6d0e94c
Show file tree
Hide file tree
Showing 16 changed files with 431 additions and 190 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,9 @@ Options:
-b, --base <BASE>
Base URL or website root directory to check relative URLs e.g. <https://example.com> or `/path/to/public`
--root-dir <ROOT_DIR>
Root path to use when checking absolute local links, must be an absolute path
--basic-auth <BASIC_AUTH>
Basic authentication support. E.g. `http://example.com username:password`
Expand Down
2 changes: 1 addition & 1 deletion examples/collect_links/collect_links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ async fn main() -> Result<()> {
},
];

let links = Collector::new(None) // base
let links = Collector::default()
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
.skip_hidden(false) // skip hidden files? (default=true)
.skip_ignored(false) // skip files that are ignored by git? (default=true)
Expand Down
8 changes: 8 additions & 0 deletions fixtures/resolve_paths_from_root_dir/nested/about/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<html>
<head>
<title>About</title>
</head>
<body>
<h1 id="fragment">About</h1>
</body>
</html>
Empty file.
34 changes: 34 additions & 0 deletions fixtures/resolve_paths_from_root_dir/nested/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<html>
<head>
<title>Index</title>
</head>
<body>
<h1>Index Title</h1>
<a id="good"></a>
<p>
<ul>
<li>
<a href="/nested">home</a>
</li>
<li>
<a href="/nested/about">About</a>
</li>
<li>
<a href="/nested/another page">About</a>
</li>
<li>
<a href="/nested/about/index.html#fragment">Fragment</a>
</li>
<li>
<a href="/nested/about/index.html#missing">Missing</a>
</li>
<li>
<a href="#good">Good</a>
</li>
<li>
<a href="#bad">Bad</a>
</li>
</ul>
</p>
</body>
</html>
8 changes: 4 additions & 4 deletions lychee-bin/src/commands/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ async fn progress_bar_task(
while let Some(response) = recv_resp.recv().await {
show_progress(
&mut io::stderr(),
&pb,
pb.as_ref(),
&response,
formatter.as_ref(),
&verbose,
Expand Down Expand Up @@ -331,7 +331,7 @@ fn ignore_cache(uri: &Uri, status: &Status, cache_exclude_status: &HashSet<u16>)

fn show_progress(
output: &mut dyn Write,
progress_bar: &Option<ProgressBar>,
progress_bar: Option<&ProgressBar>,
response: &Response,
formatter: &dyn ResponseFormatter,
verbose: &Verbosity,
Expand Down Expand Up @@ -401,7 +401,7 @@ mod tests {
let formatter = get_response_formatter(&options::OutputMode::Plain);
show_progress(
&mut buf,
&None,
None,
&response,
formatter.as_ref(),
&Verbosity::default(),
Expand All @@ -423,7 +423,7 @@ mod tests {
let formatter = get_response_formatter(&options::OutputMode::Plain);
show_progress(
&mut buf,
&None,
None,
&response,
formatter.as_ref(),
&Verbosity::debug(),
Expand Down
2 changes: 1 addition & 1 deletion lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ fn underlying_io_error_kind(error: &Error) -> Option<io::ErrorKind> {
async fn run(opts: &LycheeOptions) -> Result<i32> {
let inputs = opts.inputs()?;

let mut collector = Collector::new(opts.config.base.clone())
let mut collector = Collector::new(opts.config.root_dir.clone(), opts.config.base.clone())?
.skip_missing_inputs(opts.config.skip_missing)
.skip_hidden(!opts.config.hidden)
.skip_ignored(!opts.config.no_ignore)
Expand Down
6 changes: 6 additions & 0 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,12 @@ separated list of accepted status codes. This example will accept 200, 201,
#[serde(default)]
pub(crate) base: Option<Base>,

/// Root path to use when checking absolute local links,
/// must be an absolute path
#[arg(long)]
#[serde(default)]
pub(crate) root_dir: Option<PathBuf>,

/// Basic authentication support. E.g. `http://example.com username:password`
#[arg(long)]
#[serde(default)]
Expand Down
36 changes: 36 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,42 @@ mod cli {
.stdout(contains("3 OK"));
}

/// Runs lychee offline against the `nested/index.html` fixture with
/// `--root-dir` pointing at the fixture directory, so that root-relative
/// links such as `/nested/about` are looked up beneath that directory.
///
/// The run is expected to fail: of the seven links reported, two are
/// expected to be errors (presumably the `#missing` and `#bad` fragments
/// in the fixture page).
#[test]
fn test_resolve_paths_from_root_dir() {
    let mut cmd = main_command();
    let root_dir = fixtures_path().join("resolve_paths_from_root_dir");
    let page = root_dir.join("nested").join("index.html");

    // Flags first, then the root directory value, then the input file.
    cmd.args(["--offline", "--include-fragments", "--root-dir"])
        .arg(&root_dir)
        .arg(page)
        .env_clear();

    cmd.assert()
        .failure()
        .stdout(contains("7 Total"))
        .stdout(contains("5 OK"))
        .stdout(contains("2 Errors"));
}

/// Runs lychee offline with both `--root-dir` and `--base` supplied at the
/// same time and expects all three links of the `resolve_paths` fixture
/// page to be reported as OK.
#[test]
fn test_resolve_paths_from_root_dir_and_base_url() {
    let mut cmd = main_command();
    let fixtures = fixtures_path();
    let page = fixtures.join("resolve_paths").join("index.html");

    // `--root-dir` gets the literal `/resolve_paths`; `--base` gets the fixtures directory.
    cmd.args(["--offline", "--root-dir", "/resolve_paths", "--base"])
        .arg(&fixtures)
        .arg(page)
        .env_clear();

    cmd.assert()
        .success()
        .stdout(contains("3 Total"))
        .stdout(contains("3 OK"));
}

#[test]
fn test_youtube_quirk() {
let url = "https://www.youtube.com/watch?v=NlKuICiT470&list=PLbWDhxwM_45mPVToqaIZNbZeIzFchsKKQ&index=7";
Expand Down
84 changes: 63 additions & 21 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::ErrorKind;
use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Expand All @@ -9,6 +10,7 @@ use futures::{
StreamExt,
};
use par_stream::ParStreamExt;
use std::path::PathBuf;

/// Collector keeps the state of link collection
/// It drives the link extraction from inputs
Expand All @@ -21,22 +23,47 @@ pub struct Collector {
skip_hidden: bool,
include_verbatim: bool,
use_html5ever: bool,
root_dir: Option<PathBuf>,
base: Option<Base>,
}

impl Default for Collector {
fn default() -> Self {
Collector {
basic_auth_extractor: None,
skip_missing_inputs: false,
include_verbatim: false,
use_html5ever: false,
skip_hidden: true,
skip_ignored: true,
root_dir: None,
base: None,
}
}
}

impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub const fn new(base: Option<Base>) -> Self {
Collector {
///
/// # Errors
///
/// Returns an `Err` if the `root_dir` is not an absolute path
pub fn new(root_dir: Option<PathBuf>, base: Option<Base>) -> Result<Self> {
if let Some(root_dir) = &root_dir {
if root_dir.is_relative() {
return Err(ErrorKind::RootDirMustBeAbsolute(root_dir.clone()));
}
}
Ok(Collector {
basic_auth_extractor: None,
skip_missing_inputs: false,
include_verbatim: false,
use_html5ever: false,
skip_hidden: true,
skip_ignored: true,
root_dir,
base,
}
})
}

/// Skip missing input files (default is to error if they don't exist)
Expand Down Expand Up @@ -119,12 +146,19 @@ impl Collector {
})
.flatten()
.par_then_unordered(None, move |(content, base)| {
let root_dir = self.root_dir.clone();
let basic_auth_extractor = self.basic_auth_extractor.clone();
async move {
let content = content?;
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);
let requests = request::create(uris, &content, &base, &basic_auth_extractor);
let requests = request::create(
uris,
&content.source,
root_dir.as_ref(),
base.as_ref(),
basic_auth_extractor.as_ref(),
);
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
})
Expand All @@ -148,17 +182,25 @@ mod tests {
};

// Helper function to run the collector on the given inputs
async fn collect(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base).collect_links(inputs);
responses.map(|r| r.unwrap().uri).collect().await
async fn collect(
    inputs: Vec<Input>,
    root_dir: Option<PathBuf>,
    base: Option<Base>,
) -> Result<HashSet<Uri>> {
    // `Collector::new` rejects a non-absolute `root_dir`; that error is
    // propagated to the caller. A failed request inside the stream still
    // panics through `unwrap`, exactly as a test helper should.
    let uris: HashSet<Uri> = Collector::new(root_dir, base)?
        .collect_links(inputs)
        .map(|request| request.unwrap().uri)
        .collect()
        .await;
    Ok(uris)
}

// Helper function for collecting verbatim links
async fn collect_verbatim(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
let responses = Collector::new(base)
async fn collect_verbatim(
inputs: Vec<Input>,
root_dir: Option<PathBuf>,
base: Option<Base>,
) -> Result<HashSet<Uri>> {
let responses = Collector::new(root_dir, base)?
.include_verbatim(true)
.collect_links(inputs);
responses.map(|r| r.unwrap().uri).collect().await
Ok(responses.map(|r| r.unwrap().uri).collect().await)
}

const TEST_STRING: &str = "http://test-string.com";
Expand Down Expand Up @@ -246,7 +288,7 @@ mod tests {
},
];

let links = collect_verbatim(inputs, None).await;
let links = collect_verbatim(inputs, None, None).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website(TEST_STRING),
Expand All @@ -269,7 +311,7 @@ mod tests {
file_type_hint: Some(FileType::Markdown),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://endler.dev"),
Expand All @@ -295,7 +337,7 @@ mod tests {
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://github.com/lycheeverse/lychee/"),
Expand Down Expand Up @@ -324,7 +366,7 @@ mod tests {
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://example.com/static/image.png"),
Expand All @@ -351,7 +393,7 @@ mod tests {
excluded_paths: None,
};

let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected = HashSet::from_iter([
website("https://localhost.com/@/internal.md"),
Expand All @@ -373,7 +415,7 @@ mod tests {
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
// the body links wouldn't be present if the file was parsed strictly as XML
Expand Down Expand Up @@ -406,7 +448,7 @@ mod tests {
excluded_paths: None,
};

let links = collect(vec![input], None).await;
let links = collect(vec![input], None, None).await.ok().unwrap();

let expected_urls = HashSet::from_iter([
website("https://github.com/lycheeverse/lychee/"),
Expand All @@ -425,7 +467,7 @@ mod tests {
file_type_hint: None,
excluded_paths: None,
};
let links = collect(vec![input], None).await;
let links = collect(vec![input], None, None).await.ok().unwrap();

let expected_links = HashSet::from_iter([mail("[email protected]")]);

Expand Down Expand Up @@ -468,7 +510,7 @@ mod tests {
},
];

let links = collect(inputs, None).await;
let links = collect(inputs, None, None).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website(&format!(
Expand Down Expand Up @@ -502,7 +544,7 @@ mod tests {
excluded_paths: None,
};

let links = collect(vec![input], Some(base)).await;
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
path("/path/to/root/index.html"),
Expand Down
1 change: 1 addition & 0 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ pub mod remap;
/// local IPs or e-mail addresses
pub mod filter;

/// Test utilities
#[cfg(test)]
#[macro_use]
pub mod test_utils;
Expand Down
9 changes: 0 additions & 9 deletions lychee-lib/src/types/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,6 @@ impl Base {
}
}

/// Return the directory if the base is local
#[must_use]
pub(crate) fn dir(&self) -> Option<PathBuf> {
match self {
Self::Remote(_) => None,
Self::Local(d) => Some(d.clone()),
}
}

pub(crate) fn from_source(source: &InputSource) -> Option<Base> {
match &source {
InputSource::RemoteUrl(url) => {
Expand Down
Loading

0 comments on commit 6d0e94c

Please sign in to comment.