From 8ed2ec1c57fa98852d5f6692972d966a502368b9 Mon Sep 17 00:00:00 2001 From: Maverick Liu Date: Sat, 22 Feb 2025 16:28:44 +0800 Subject: [PATCH] fix post process --- src/lib.rs | 82 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ce20dbc..bd512ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,7 +25,11 @@ pub fn convert_epub_to_mdbook( return Err(Error::NotAFile(epub_path.display().to_string())); } let book_name = epub_path.with_extension(""); - let book_name = book_name.file_name().unwrap().to_string_lossy().to_string(); + let book_name = book_name + .file_name() + .expect("unreachable") + .to_string_lossy() + .to_string(); let output_dir = match output_dir { Some(output_dir) => output_dir.as_ref().join(&book_name), None => PathBuf::from(".").join(&book_name), @@ -101,14 +105,6 @@ pub fn toc_to_md( (summary_md, html_to_md) } -/// Capture the `{link}` without `#`, eg: -/// ``` -/// [ABC]({abc.html}#xxx) -/// [ABC]({abc.html}) -/// ``` -static LINK_REGEX: LazyLock = - LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).unwrap()); - fn extract_chapters_and_resources( doc: &mut EpubDoc, output_dir: impl AsRef, @@ -121,27 +117,15 @@ fn extract_chapters_and_resources( Some(content) => content, None => continue, // unreachable }; - if let Some(path) = html_to_md.get(&path) { + if let Some(md_path) = html_to_md.get(&path) { // html file, convert to md - let target_path = src_dir.join(path); + let target_path = src_dir.join(md_path); if let Some(parent) = target_path.parent() { fs::create_dir_all(parent)?; } - let html = String::from_utf8(content)?; - let markdown = LINK_REGEX - .replace_all(&html2md::parse_html(&html), |caps: &Captures| { - // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) - let origin = &caps[0]; - let link = &caps[1]; - if let Some(md_path) = html_to_md.get(&PathBuf::from(link)) { - let md_path = md_path.to_string_lossy().to_string(); - origin.replace(link, &md_path) - } else { - origin.to_string() - } - }) - .replace(r"![]()", "") - .replace(r"[]()", ""); + let html = String::from_utf8(content.clone())?; + let markdown = html2md::parse_html(&html); + let markdown = post_process_md(&markdown, html_to_md); fs::write(target_path, markdown)?; } else { // other file, just copy @@ -155,6 +139,52 @@ fn extract_chapters_and_resources( Ok(()) } +/// Capture the `{link}` without `#`, eg: +/// ``` +/// [ABC]({abc.html}#xxx) +/// [ABC]({abc.html}) +/// ``` +static LINK_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable")); + +/// Capture the empty links, eg: +/// ``` +/// [{ABC}]() +/// ``` +static LINK_NAME_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r#"\[([^\]]+)\]\(\)"#).expect("unreachable")); + +fn post_process_md(markdown: &str, html_to_md: &HashMap) -> String { + let file_name_map = html_to_md + .iter() + .filter_map(|(k, v)| Some((k.file_name()?, v.file_name()?))) + .collect::>(); + + let markdown = LINK_REGEX + .replace_all(markdown, |caps: &Captures| { + // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) + let origin = &caps[0]; + let link = match Path::new(&caps[1]).file_name() { + Some(link) => link, + None => return origin.to_string(), + }; + if let Some(md_path) = file_name_map.get(link) { + origin.replace( + &link.to_string_lossy().to_string(), + &md_path.to_string_lossy(), + ) + } else { + origin.to_string() + } + }) + .replace(r"![]()", "") + .replace(r"[]()", ""); + + LINK_NAME_REGEX + .replace_all(&markdown, |caps: &Captures| caps[1].to_string()) + .to_string() +} + fn write_book_toml( output_dir: impl AsRef, title: &str,