mirror of
https://github.com/cyborg42/epub2mdbook.git
synced 2026-02-10 16:24:50 -05:00
use htmd instead of html2md
This commit is contained in:
parent
67637842fb
commit
a84969da20
4 changed files with 50 additions and 133 deletions
90
Cargo.lock
generated
90
Cargo.lock
generated
|
|
@ -88,18 +88,6 @@ version = "2.8.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
|
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "bytes"
|
|
||||||
version = "1.10.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cesu8"
|
|
||||||
version = "1.1.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
|
|
@ -152,16 +140,6 @@ version = "1.0.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "combine"
|
|
||||||
version = "4.6.7"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd"
|
|
||||||
dependencies = [
|
|
||||||
"bytes",
|
|
||||||
"memchr",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crc32fast"
|
name = "crc32fast"
|
||||||
version = "1.4.2"
|
version = "1.4.2"
|
||||||
|
|
@ -214,11 +192,11 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "epub2mdbook"
|
name = "epub2mdbook"
|
||||||
version = "0.7.0"
|
version = "0.8.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"epub",
|
"epub",
|
||||||
"html2md",
|
"htmd",
|
||||||
"regex",
|
"regex",
|
||||||
"thiserror 2.0.11",
|
"thiserror 2.0.11",
|
||||||
]
|
]
|
||||||
|
|
@ -262,17 +240,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "html2md"
|
name = "htmd"
|
||||||
version = "0.2.15"
|
version = "0.1.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8cff9891f2e0d9048927fbdfc28b11bf378f6a93c7ba70b23d0fbee9af6071b4"
|
checksum = "ad1642def6e8e4dc182941f35454f7d2af917787f91f3f5133300030b41006d0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"html5ever",
|
"html5ever",
|
||||||
"jni",
|
|
||||||
"lazy_static",
|
|
||||||
"markup5ever_rcdom",
|
"markup5ever_rcdom",
|
||||||
"percent-encoding",
|
|
||||||
"regex",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
@ -305,32 +279,6 @@ version = "1.70.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "jni"
|
|
||||||
version = "0.19.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec"
|
|
||||||
dependencies = [
|
|
||||||
"cesu8",
|
|
||||||
"combine",
|
|
||||||
"jni-sys",
|
|
||||||
"log",
|
|
||||||
"thiserror 1.0.69",
|
|
||||||
"walkdir",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "jni-sys"
|
|
||||||
version = "0.3.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "lazy_static"
|
|
||||||
version = "1.5.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.169"
|
version = "0.2.169"
|
||||||
|
|
@ -586,15 +534,6 @@ version = "0.8.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "same-file"
|
|
||||||
version = "1.0.6"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
|
||||||
dependencies = [
|
|
||||||
"winapi-util",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "scopeguard"
|
name = "scopeguard"
|
||||||
version = "1.2.0"
|
version = "1.2.0"
|
||||||
|
|
@ -761,25 +700,6 @@ version = "0.2.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "walkdir"
|
|
||||||
version = "2.5.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
|
|
||||||
dependencies = [
|
|
||||||
"same-file",
|
|
||||||
"winapi-util",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "winapi-util"
|
|
||||||
version = "0.1.9"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
|
|
||||||
dependencies = [
|
|
||||||
"windows-sys",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.59.0"
|
version = "0.59.0"
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "epub2mdbook"
|
name = "epub2mdbook"
|
||||||
version = "0.7.0"
|
version = "0.8.0"
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
description = "A tool to convert EPUB files to MDBook format"
|
description = "A tool to convert EPUB files to MDBook format"
|
||||||
authors = ["Maverick Liu <maverick.liu42@gmail.com>"]
|
authors = ["Maverick Liu <maverick.liu42@gmail.com>"]
|
||||||
|
|
@ -12,6 +12,6 @@ categories = ["command-line-utilities", "text-processing"]
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "4.5.30", features = ["derive"] }
|
clap = { version = "4.5.30", features = ["derive"] }
|
||||||
epub = "2.1.1"
|
epub = "2.1.1"
|
||||||
html2md = "0.2.15"
|
htmd = "0.1.6"
|
||||||
regex = "1.11.1"
|
regex = "1.11.1"
|
||||||
thiserror = "2.0.11"
|
thiserror = "2.0.11"
|
||||||
|
|
|
||||||
87
src/lib.rs
87
src/lib.rs
|
|
@ -115,32 +115,29 @@ fn extract_chapters_and_resources<R: Read + Seek>(
|
||||||
let output_dir = output_dir.as_ref();
|
let output_dir = output_dir.as_ref();
|
||||||
let src_dir = output_dir.join("src");
|
let src_dir = output_dir.join("src");
|
||||||
for (_, (path, _)) in doc.resources.clone().into_iter() {
|
for (_, (path, _)) in doc.resources.clone().into_iter() {
|
||||||
let content = match doc.get_resource_by_path(&path) {
|
let mut content = match doc.get_resource_by_path(&path) {
|
||||||
Some(content) => content,
|
Some(content) => content,
|
||||||
None => continue, // unreachable
|
None => continue, // unreachable
|
||||||
};
|
};
|
||||||
if let Some(md_path) = html_to_md.get(&path) {
|
let target_path = if let Some(md_path) = html_to_md.get(&path) {
|
||||||
// html file, convert to md
|
// html file, convert to md
|
||||||
let target_path = if md_path == Path::new("SUMMARY.md") {
|
let html = String::from_utf8(content.clone())?;
|
||||||
|
let markdown = htmd::convert(&html)?;
|
||||||
|
content = post_process_md(&markdown, &file_name_map).into_bytes();
|
||||||
|
if md_path == Path::new("SUMMARY.md") {
|
||||||
src_dir.join("_SUMMARY.md")
|
src_dir.join("_SUMMARY.md")
|
||||||
} else {
|
} else {
|
||||||
src_dir.join(md_path)
|
src_dir.join(md_path)
|
||||||
};
|
|
||||||
if let Some(parent) = target_path.parent() {
|
|
||||||
fs::create_dir_all(parent)?;
|
|
||||||
}
|
}
|
||||||
let html = String::from_utf8(content)?;
|
|
||||||
let markdown = html2md::parse_html(&html);
|
|
||||||
let markdown = post_process_md(&markdown, &file_name_map);
|
|
||||||
fs::write(target_path, markdown)?;
|
|
||||||
} else {
|
} else {
|
||||||
// other file, just copy
|
// other file, just copy
|
||||||
let target_path = src_dir.join(&path);
|
src_dir.join(&path)
|
||||||
if let Some(parent) = target_path.parent() {
|
};
|
||||||
fs::create_dir_all(parent)?;
|
// write to target path
|
||||||
}
|
if let Some(parent) = target_path.parent() {
|
||||||
fs::write(target_path, content)?;
|
fs::create_dir_all(parent)?;
|
||||||
}
|
}
|
||||||
|
fs::write(target_path, content)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
@ -150,39 +147,39 @@ fn extract_chapters_and_resources<R: Read + Seek>(
|
||||||
/// [ABC]({abc.html}#xxx)
|
/// [ABC]({abc.html}#xxx)
|
||||||
/// [ABC]({abc.html})
|
/// [ABC]({abc.html})
|
||||||
/// ```
|
/// ```
|
||||||
static LINK: LazyLock<Regex> =
|
static LINK: LazyLock<Regex> = LazyLock::new(|| {
|
||||||
LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable"));
|
Regex::new(r#"\[[^\]]+\]\(?P<link>([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable")
|
||||||
static EMPTY_LINK: LazyLock<Regex> =
|
});
|
||||||
LazyLock::new(|| Regex::new(r#"\[([^\]]+)\]\(\)"#).expect("unreachable"));
|
/// Match the URL link, eg:
|
||||||
|
/// ```
|
||||||
|
/// https://www.example.com\
|
||||||
|
/// ```
|
||||||
static URL_LINK: LazyLock<Regex> =
|
static URL_LINK: LazyLock<Regex> =
|
||||||
LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").expect("unreachable"));
|
LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").expect("unreachable"));
|
||||||
fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String {
|
|
||||||
let markdown = LINK
|
|
||||||
.replace_all(markdown, |caps: &Captures| {
|
|
||||||
// replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx)
|
|
||||||
let origin = &caps[0];
|
|
||||||
let link = &caps[1];
|
|
||||||
// Don't modify links with schemes like `https`.
|
|
||||||
if URL_LINK.is_match(link) {
|
|
||||||
return origin.to_string();
|
|
||||||
}
|
|
||||||
let html_file_name = match Path::new(&link).file_name() {
|
|
||||||
Some(link) => link,
|
|
||||||
None => return origin.to_string(),
|
|
||||||
};
|
|
||||||
if let Some(md_file_name) = file_name_map.get(html_file_name) {
|
|
||||||
origin.replace(
|
|
||||||
&html_file_name.to_string_lossy().to_string(),
|
|
||||||
&md_file_name.to_string_lossy(),
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
origin.to_string()
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.replace(r"![]()", "")
|
|
||||||
.replace(r"[]()", "");
|
|
||||||
|
|
||||||
EMPTY_LINK.replace_all(&markdown, "$1").to_string()
|
fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String {
|
||||||
|
LINK.replace_all(markdown, |caps: &Captures| {
|
||||||
|
// replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx)
|
||||||
|
let origin = &caps[0];
|
||||||
|
let link = &caps["link"];
|
||||||
|
// Don't modify links with schemes like `https`.
|
||||||
|
if URL_LINK.is_match(link) {
|
||||||
|
return origin.to_string();
|
||||||
|
}
|
||||||
|
let html_file_name = match Path::new(&link).file_name() {
|
||||||
|
Some(link) => link,
|
||||||
|
None => return origin.to_string(),
|
||||||
|
};
|
||||||
|
if let Some(md_file_name) = file_name_map.get(html_file_name) {
|
||||||
|
origin.replace(
|
||||||
|
&*html_file_name.to_string_lossy(),
|
||||||
|
&md_file_name.to_string_lossy(),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
origin.to_string()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_book_toml(
|
fn write_book_toml(
|
||||||
|
|
|
||||||
|
|
@ -18,4 +18,4 @@ fn main() -> Result<(), Error> {
|
||||||
convert_epub_to_mdbook(args.input_epub, args.output_dir)?;
|
convert_epub_to_mdbook(args.input_epub, args.output_dir)?;
|
||||||
println!("Conversion completed successfully!");
|
println!("Conversion completed successfully!");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue