From a84969da20ac25561fec402fe9324642e0415b40 Mon Sep 17 00:00:00 2001 From: Maverick Liu Date: Sun, 23 Feb 2025 14:17:11 +0800 Subject: [PATCH] use htmd instead of html2md --- Cargo.lock | 90 +++-------------------------------------------------- Cargo.toml | 4 +-- src/lib.rs | 87 +++++++++++++++++++++++++-------------------------- src/main.rs | 2 +- 4 files changed, 50 insertions(+), 133 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75afa28..0d5f8ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,18 +88,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" -[[package]] -name = "bytes" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" - -[[package]] -name = "cesu8" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" - [[package]] name = "cfg-if" version = "1.0.0" @@ -152,16 +140,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" -[[package]] -name = "combine" -version = "4.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" -dependencies = [ - "bytes", - "memchr", -] - [[package]] name = "crc32fast" version = "1.4.2" @@ -214,11 +192,11 @@ dependencies = [ [[package]] name = "epub2mdbook" -version = "0.7.0" +version = "0.8.0" dependencies = [ "clap", "epub", - "html2md", + "htmd", "regex", "thiserror 2.0.11", ] @@ -262,17 +240,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "html2md" -version = 
"0.2.15" +name = "htmd" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cff9891f2e0d9048927fbdfc28b11bf378f6a93c7ba70b23d0fbee9af6071b4" +checksum = "ad1642def6e8e4dc182941f35454f7d2af917787f91f3f5133300030b41006d0" dependencies = [ "html5ever", - "jni", - "lazy_static", "markup5ever_rcdom", - "percent-encoding", - "regex", ] [[package]] @@ -305,32 +279,6 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" -[[package]] -name = "jni" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec" -dependencies = [ - "cesu8", - "combine", - "jni-sys", - "log", - "thiserror 1.0.69", - "walkdir", -] - -[[package]] -name = "jni-sys" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "libc" version = "0.2.169" @@ -586,15 +534,6 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - [[package]] name = "scopeguard" version = "1.2.0" @@ -761,25 +700,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "walkdir" 
-version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "winapi-util" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" -dependencies = [ - "windows-sys", -] - [[package]] name = "windows-sys" version = "0.59.0" diff --git a/Cargo.toml b/Cargo.toml index cbea057..d72e12c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "epub2mdbook" -version = "0.7.0" +version = "0.8.0" edition = "2024" description = "A tool to convert EPUB files to MDBook format" authors = ["Maverick Liu "] @@ -12,6 +12,6 @@ categories = ["command-line-utilities", "text-processing"] [dependencies] clap = { version = "4.5.30", features = ["derive"] } epub = "2.1.1" -html2md = "0.2.15" +htmd = "0.1.6" regex = "1.11.1" thiserror = "2.0.11" diff --git a/src/lib.rs b/src/lib.rs index cb2f4bd..a6cce97 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,32 +115,29 @@ fn extract_chapters_and_resources( let output_dir = output_dir.as_ref(); let src_dir = output_dir.join("src"); for (_, (path, _)) in doc.resources.clone().into_iter() { - let content = match doc.get_resource_by_path(&path) { + let mut content = match doc.get_resource_by_path(&path) { Some(content) => content, None => continue, // unreachable }; - if let Some(md_path) = html_to_md.get(&path) { + let target_path = if let Some(md_path) = html_to_md.get(&path) { // html file, convert to md - let target_path = if md_path == Path::new("SUMMARY.md") { + let html = String::from_utf8(content.clone())?; + let markdown = htmd::convert(&html)?; + content = post_process_md(&markdown, &file_name_map).into_bytes(); + if md_path == Path::new("SUMMARY.md") { src_dir.join("_SUMMARY.md") } else { src_dir.join(md_path) - }; - if let Some(parent) = 
target_path.parent() { - fs::create_dir_all(parent)?; } - let html = String::from_utf8(content)?; - let markdown = html2md::parse_html(&html); - let markdown = post_process_md(&markdown, &file_name_map); - fs::write(target_path, markdown)?; } else { // other file, just copy - let target_path = src_dir.join(&path); - if let Some(parent) = target_path.parent() { - fs::create_dir_all(parent)?; - } - fs::write(target_path, content)?; + src_dir.join(&path) + }; + // write to target path + if let Some(parent) = target_path.parent() { + fs::create_dir_all(parent)?; } + fs::write(target_path, content)?; } Ok(()) } @@ -150,39 +147,39 @@ fn extract_chapters_and_resources( /// [ABC]({abc.html}#xxx) /// [ABC]({abc.html}) /// ``` -static LINK: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable")); -static EMPTY_LINK: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r#"\[([^\]]+)\]\(\)"#).expect("unreachable")); +static LINK: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r#"\[[^\]]+\]\((?P<link>[^#)]+)(?:#[^)]+)?\)"#).expect("unreachable") +}); +/// Match the URL link, eg: +/// ``` +/// https://www.example.com\ +/// ``` static URL_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").expect("unreachable")); -fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String { - let markdown = LINK - .replace_all(markdown, |caps: &Captures| { - // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) - let origin = &caps[0]; - let link = &caps[1]; - // Don't modify links with schemes like `https`. 
- if URL_LINK.is_match(link) { - return origin.to_string(); - } - let html_file_name = match Path::new(&link).file_name() { - Some(link) => link, - None => return origin.to_string(), - }; - if let Some(md_file_name) = file_name_map.get(html_file_name) { - origin.replace( - &html_file_name.to_string_lossy().to_string(), - &md_file_name.to_string_lossy(), - ) - } else { - origin.to_string() - } - }) - .replace(r"![]()", "") - .replace(r"[]()", ""); - EMPTY_LINK.replace_all(&markdown, "$1").to_string() +fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String { + LINK.replace_all(markdown, |caps: &Captures| { + // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) + let origin = &caps[0]; + let link = &caps["link"]; + // Don't modify links with schemes like `https`. + if URL_LINK.is_match(link) { + return origin.to_string(); + } + let html_file_name = match Path::new(&link).file_name() { + Some(link) => link, + None => return origin.to_string(), + }; + if let Some(md_file_name) = file_name_map.get(html_file_name) { + origin.replace( + &*html_file_name.to_string_lossy(), + &md_file_name.to_string_lossy(), + ) + } else { + origin.to_string() + } + }) + .to_string() } fn write_book_toml( diff --git a/src/main.rs b/src/main.rs index a31d36f..3f114bf 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,4 +18,4 @@ fn main() -> Result<(), Error> { convert_epub_to_mdbook(args.input_epub, args.output_dir)?; println!("Conversion completed successfully!"); Ok(()) -} +} \ No newline at end of file