diff --git a/Cargo.lock b/Cargo.lock index ca82644..df3c2ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.18" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -34,15 +34,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -108,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.54" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -118,9 +118,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.54" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -130,9 +130,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -142,9 +142,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "colorchoice" @@ -187,7 +187,7 @@ dependencies = [ [[package]] name = "epub2mdbook" -version = "0.16.1" +version = "0.17.0" dependencies = [ "clap", "epub", @@ -195,7 +195,7 @@ dependencies = [ "mdbook-core", "regex", "thiserror", - "toml", + "toml 1.1.2+spec-1.1.0", ] [[package]] @@ -221,21 +221,11 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" [[package]] name = "heck" @@ -245,31 +235,30 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "htmd" -version = "0.5.0" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60ae59466542f2346e43d4a5e9b4432a1fc915b279c9fc0484e9ed7379121454" +checksum = "7eee9b00ee2e599b4f86507157e3db786e7a3319fc225f0e9584151dbea2291d" dependencies = [ "html5ever", "markup5ever_rcdom", - "phf 0.13.1", + "phf", ] [[package]] name = "html5ever" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" +checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" dependencies = [ "log", "markup5ever", - "match_token", ] [[package]] name = "indexmap" -version = "2.12.1" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", "hashbrown", @@ -318,17 +307,11 @@ version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - [[package]] name = "markup5ever" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" +checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862" dependencies = [ "log", "tendril", @@ -337,9 +320,9 @@ dependencies = [ [[package]] name = "markup5ever_rcdom" -version = "0.35.0+unofficial" +version = "0.38.0+unofficial" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8bcd53df4748257345b8bc156d620340ce0f015ec1c7ef1cff475543888a31d" +checksum = "333171ccdf66e915257740d44e38ea5b1b19ce7b45d33cc35cb6f118fbd981ff" dependencies = [ "html5ever", "markup5ever", @@ -347,17 +330,6 @@ dependencies = [ "xml5ever", ] -[[package]] -name = "match_token" -version = "0.35.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "mdbook-core" version = "0.5.2" @@ -368,7 +340,7 @@ dependencies = [ "regex", "serde", "serde_json", - "toml", + "toml 0.9.10+spec-1.1.0", "tracing", ] @@ -429,15 +401,6 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" -[[package]] -name = "phf" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" -dependencies = [ - "phf_shared 0.11.3", -] - [[package]] name = "phf" version = "0.13.1" @@ -445,28 +408,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ "phf_macros", - "phf_shared 0.13.1", + "phf_shared", "serde", ] [[package]] name = "phf_codegen" -version = "0.11.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" dependencies = [ - "phf_generator 0.11.3", - "phf_shared 0.11.3", -] - -[[package]] -name = "phf_generator" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" -dependencies = [ - "phf_shared 0.11.3", - "rand", + "phf_generator", + "phf_shared", ] [[package]] @@ -476,7 +429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ "fastrand", - "phf_shared 0.13.1", + "phf_shared", ] [[package]] @@ -485,22 +438,13 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" dependencies = [ - "phf_generator 0.13.1", - "phf_shared 0.13.1", + "phf_generator", + "phf_shared", "proc-macro2", "quote", "syn", ] -[[package]] -name = "phf_shared" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" -dependencies = [ - "siphasher", -] - [[package]] name = "phf_shared" version = "0.13.1" @@ -524,37 +468,22 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.105" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.38" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" - [[package]] name = "redox_syscall" version = "0.5.9" @@ -566,9 +495,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -644,9 +573,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "1.0.4" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" +checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26" dependencies = [ "serde_core", ] @@ -671,25 +600,25 @@ checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "string_cache" -version = "0.8.8" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938d512196766101d333398efde81bc1f37b00cb42c2f8350e5df639f040bbbe" +checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901" dependencies = [ "new_debug_unreachable", "parking_lot", - "phf_shared 0.11.3", + "phf_shared", "precomputed-hash", "serde", ] [[package]] name = "string_cache_codegen" -version = "0.5.4" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69" dependencies = [ - "phf_generator 0.11.3", - "phf_shared 0.11.3", + "phf_generator", + "phf_shared", "proc-macro2", "quote", ] @@ -702,9 +631,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.98" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -713,29 +642,28 @@ dependencies = [ [[package]] name = "tendril" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24" dependencies = [ - "futf", - "mac", + "new_debug_unreachable", "utf-8", ] [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -751,10 +679,25 @@ dependencies = [ "indexmap", "serde_core", "serde_spanned", - "toml_datetime", + "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", - "winnow", + "winnow 0.7.14", +] + +[[package]] +name = "toml" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime 1.1.1+spec-1.1.0", + "toml_parser", + "toml_writer", + "winnow 1.0.3", ] [[package]] @@ -767,19 +710,28 @@ dependencies = [ ] [[package]] -name = "toml_parser" -version = "1.0.6+spec-1.1.0" +name = "toml_datetime" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ - "winnow", + "serde_core", +] + +[[package]] +name = "toml_parser" +version = "1.1.2+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" +dependencies = [ + "winnow 1.0.3", ] [[package]] name = "toml_writer" -version = "1.0.6+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" +checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db" [[package]] name = "tracing" @@ -832,11 +784,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "web_atoms" -version = "0.1.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538" dependencies = [ - "phf 0.11.3", + "phf", "phf_codegen", "string_cache", "string_cache_codegen", @@ -921,6 +873,12 @@ version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +[[package]] +name = "winnow" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1" + [[package]] name = "xml" version = "1.2.0" @@ -938,9 +896,9 @@ dependencies = [ [[package]] name = "xml5ever" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494" +checksum = "d3dc9559429edf0cd3f327cc0afd9d6b36fa8cec6d93107b7fbe64f806b5f2d9" dependencies = [ "log", "markup5ever", diff --git a/Cargo.toml b/Cargo.toml index 9f703d9..f91fbf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "epub2mdbook" -version = "0.16.1" +version = "0.17.0" edition = "2024" description = "A tool to convert EPUB files to MDBook format" authors = ["Maverick Liu "] @@ -10,10 +10,10 @@ keywords = ["epub", "mdbook", "converter", "ebook"] categories = ["command-line-utilities", "text-processing"] [dependencies] -clap = { version = "4.5.54", features = ["derive"] } +clap = { version = "4.6.1", features = ["derive"] } epub = "2.1.5" -htmd = "0.5.0" +htmd = "0.5.4" mdbook-core = "0.5.2" -regex = "1.12.2" -thiserror = "2.0.17" -toml = "0.9.10" +regex = "1.12.3" +thiserror = "2.0.18" +toml = "1.1.2" diff --git a/src/lib.rs b/src/lib.rs index a3d5c7e..6b4404d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,12 +2,13 @@ pub mod error; use epub::doc::{EpubDoc, NavPoint}; use error::Error; +use htmd::element_handler::{HandlerResult, Handlers}; use mdbook_core::config::BookConfig; use regex::{Captures, Regex}; use std::collections::HashMap; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::io::{Read, Seek}; -use std::path::{Path, PathBuf}; +use std::path::{Component, Path, PathBuf}; use std::sync::LazyLock; use std::{fs, io}; @@ -42,7 +43,8 @@ pub fn convert_epub_to_mdbook( let mut epub_doc = EpubDoc::new(epub_path)?; let (summary_md, html_to_md) = generate_summary_md(&epub_doc); - extract_chapters_and_resources(&mut epub_doc, &output_dir, &html_to_md)?; + let html_to_title = collect_chapter_titles(&epub_doc, &html_to_md); + extract_chapters_and_resources(&mut epub_doc, &output_dir, &html_to_md, &html_to_title)?; fs::write(output_dir.join("src/SUMMARY.md"), summary_md)?; write_book_toml(&epub_doc, &output_dir)?; Ok(()) @@ -53,13 +55,14 @@ fn epub_nav_to_md( indent: usize, html_to_md: &HashMap, ) -> Option { - let file = html_to_md.get(&nav.content)?; - let mut md = format!( - "{}- [{}]({})\n", - " ".repeat(indent), - nav.label, - file.to_string_lossy() - ); + let (content_path, fragment) = split_fragment(&nav.content); + let file = html_to_md.get(&content_path)?; + let mut link = path_to_markdown_link(file); + if let Some(fragment) = fragment { + link.push('#'); + link.push_str(&fragment); + } + let mut md = format!("{}- [{}]({})\n", " ".repeat(indent), nav.label, link); for child in &nav.children { if let Some(child_md) = epub_nav_to_md(child, indent + 1, html_to_md) { md.push_str(&child_md); @@ -95,23 +98,48 @@ pub fn generate_summary_md( }) .map(|(_, resource)| (resource.path.clone(), resource.path.with_extension("md"))) .collect::>(); - for nav in &epub_doc.toc { - if let Some(md) = epub_nav_to_md(nav, 0, &html_to_md) { - summary_md.push_str(&md); + if epub_doc.toc.is_empty() { + summary_md.push_str(&spine_to_md(epub_doc, &html_to_md)); + } else { + for nav in &epub_doc.toc { + if let Some(md) = epub_nav_to_md(nav, 0, &html_to_md) { + summary_md.push_str(&md); + } } } (summary_md, html_to_md) } +fn spine_to_md( + epub_doc: &EpubDoc, + html_to_md: &HashMap, +) -> String { + let mut md = String::new(); + for spine_item in &epub_doc.spine { + if !spine_item.linear { + continue; + } + let Some(resource) = epub_doc.resources.get(&spine_item.idref) else { + continue; + }; + let Some(file) = html_to_md.get(&resource.path) else { + continue; + }; + md.push_str(&format!( + "- [{}]({})\n", + path_to_title(&resource.path), + path_to_markdown_link(file) + )); + } + md +} + fn extract_chapters_and_resources( epub_doc: &mut EpubDoc, output_dir: impl AsRef, html_to_md: &HashMap, + html_to_title: &HashMap, ) -> Result<(), Error> { - let file_name_map = html_to_md - .iter() - .filter_map(|(k, v)| Some((k.file_name()?, v.file_name()?))) - .collect::>(); let src_dir = output_dir.as_ref().join("src"); for (_, resource) in epub_doc.resources.clone() { let path = &resource.path; @@ -122,8 +150,10 @@ fn extract_chapters_and_resources( let target_path = if let Some(md_path) = html_to_md.get(path) { // html file, convert to md let html = String::from_utf8(content.clone())?; - let markdown = htmd::convert(&html)?; - content = post_process_md(&markdown, &file_name_map).into_bytes(); + let markdown = convert_epub_html_to_md(&html)?; + let markdown = + add_missing_chapter_title(&markdown, html_to_title.get(path).map(String::as_str)); + content = post_process_md(&markdown, path, html_to_md).into_bytes(); if md_path == Path::new("SUMMARY.md") { src_dir.join("_SUMMARY.md") } else { @@ -142,13 +172,246 @@ fn extract_chapters_and_resources( Ok(()) } +fn collect_chapter_titles( + epub_doc: &EpubDoc, + html_to_md: &HashMap, +) -> HashMap { + let mut html_to_title = HashMap::new(); + for nav in &epub_doc.toc { + collect_nav_titles(nav, &mut html_to_title); + } + for spine_item in &epub_doc.spine { + let Some(resource) = epub_doc.resources.get(&spine_item.idref) else { + continue; + }; + if html_to_md.contains_key(&resource.path) { + html_to_title + .entry(resource.path.clone()) + .or_insert_with(|| path_to_title(&resource.path)); + } + } + html_to_title +} + +fn collect_nav_titles(nav: &NavPoint, html_to_title: &mut HashMap) { + let label = nav.label.trim(); + if !label.is_empty() { + let path = strip_fragment(&nav.content); + html_to_title + .entry(path) + .or_insert_with(|| label.to_string()); + } + + for child in &nav.children { + collect_nav_titles(child, html_to_title); + } +} + +fn convert_epub_html_to_md(html: &str) -> io::Result { + htmd::HtmlToMarkdown::builder() + .skip_tags(vec!["head"]) + .add_handler( + vec![ + "a", + "article", + "aside", + "blockquote", + "body", + "div", + "figcaption", + "figure", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "li", + "main", + "nav", + "p", + "section", + "span", + "td", + "th", + ], + preserve_id_handler, + ) + .build() + .convert(html) +} + +fn preserve_id_handler(handlers: &dyn Handlers, element: htmd::Element) -> Option { + let id = element + .attrs + .iter() + .find(|attr| &*attr.name.local == "id") + .map(|attr| attr.value.to_string()) + .filter(|id| !id.trim().is_empty()); + let mut result = handlers.fallback(element)?; + if let Some(id) = id { + let content = result.content.trim_start_matches('\n'); + result.content = format!("\n\n\n\n{}", escape_attr(&id), content); + } + Some(result) +} + +fn add_missing_chapter_title(markdown: &str, title: Option<&str>) -> String { + let title = match title.map(str::trim).filter(|title| !title.is_empty()) { + Some(title) => title, + None => return markdown.to_string(), + }; + if starts_with_markdown_heading(markdown) { + return markdown.to_string(); + } + + let markdown = markdown.trim_start_matches('\n'); + if markdown.is_empty() { + format!("# {title}") + } else { + format!("# {title}\n\n{markdown}") + } +} + +fn starts_with_markdown_heading(markdown: &str) -> bool { + for line in markdown.lines().filter(|line| !line.trim().is_empty()) { + if is_html_anchor(line) { + continue; + } + return is_atx_heading(line); + } + false +} + +fn is_atx_heading(line: &str) -> bool { + let trimmed = line.trim_start_matches(' '); + if line.len() - trimmed.len() > 3 { + return false; + } + + let hashes = trimmed.bytes().take_while(|byte| *byte == b'#').count(); + if !(1..=6).contains(&hashes) { + return false; + } + + let rest = &trimmed[hashes..]; + rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t') +} + +fn is_html_anchor(line: &str) -> bool { + let trimmed = line.trim(); + trimmed.starts_with("") + && (trimmed.contains(" id=") || trimmed.contains(" name=")) +} + +fn strip_fragment(path: &Path) -> PathBuf { + split_fragment(path).0 +} + +fn split_fragment(path: &Path) -> (PathBuf, Option) { + let path = path.to_string_lossy(); + match path.split_once('#') { + Some((path, fragment)) => (PathBuf::from(path), Some(fragment.to_string())), + None => (PathBuf::from(path.as_ref()), None), + } +} + +fn path_to_title(path: &Path) -> String { + path.file_stem() + .and_then(OsStr::to_str) + .map(|stem| stem.replace(['-', '_'], " ")) + .filter(|title| !title.trim().is_empty()) + .unwrap_or_else(|| path_to_markdown_link(path)) +} + +fn resolve_relative_path(current_file: &Path, link: &str) -> PathBuf { + let link_path = Path::new(link); + let mut resolved = if link_path.is_absolute() { + PathBuf::new() + } else { + current_file + .parent() + .unwrap_or_else(|| Path::new("")) + .to_owned() + }; + + for component in link_path.components() { + match component { + Component::CurDir => {} + Component::ParentDir => { + resolved.pop(); + } + Component::Normal(part) => resolved.push(part), + Component::RootDir | Component::Prefix(_) => {} + } + } + + resolved +} + +fn relative_path(from_file: &Path, to_file: &Path) -> PathBuf { + let from_dir = from_file.parent().unwrap_or_else(|| Path::new("")); + let from = normalized_components(from_dir); + let to = normalized_components(to_file); + let common_len = from + .iter() + .zip(to.iter()) + .take_while(|(left, right)| left == right) + .count(); + + let mut relative = PathBuf::new(); + for _ in common_len..from.len() { + relative.push(".."); + } + for component in &to[common_len..] { + relative.push(component); + } + relative +} + +fn normalized_components(path: &Path) -> Vec { + let mut components = Vec::new(); + for component in path.components() { + match component { + Component::CurDir | Component::RootDir | Component::Prefix(_) => {} + Component::ParentDir => { + components.pop(); + } + Component::Normal(part) => components.push(part.to_os_string()), + } + } + components +} + +fn path_to_markdown_link(path: &Path) -> String { + let parts = path + .components() + .filter_map(|component| match component { + Component::CurDir => Some(".".to_string()), + Component::ParentDir => Some("..".to_string()), + Component::Normal(part) => Some(part.to_string_lossy().to_string()), + Component::RootDir | Component::Prefix(_) => None, + }) + .collect::>(); + parts.join("/") +} + +fn escape_attr(value: &str) -> String { + value + .replace('&', "&") + .replace('"', """) + .replace('<', "<") + .replace('>', ">") +} + /// Capture the `{link}` without `#`, eg: /// ```text /// [ABC]({abc.html}#xxx) /// [ABC]({abc.html}) /// ``` static LINK: LazyLock = LazyLock::new(|| { - Regex::new(r#"\[[^\]]+\]\((?P[^#)]+)(#[^)]+)?\)"#).expect("unreachable") + Regex::new(r#"\[[^\]]+\]\((?P[^#)]+)(?P#[^)]+)?\)"#).expect("unreachable") }); /// Match the URL link, eg: /// ```text @@ -157,7 +420,11 @@ static LINK: LazyLock = LazyLock::new(|| { static URL_LINK: LazyLock = LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").expect("unreachable")); -fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String { +fn post_process_md( + markdown: &str, + current_html_path: &Path, + html_to_md: &HashMap, +) -> String { LINK.replace_all(markdown, |caps: &Captures| { // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) let origin = &caps[0]; @@ -166,15 +433,14 @@ fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> S if URL_LINK.is_match(link) { return origin.to_string(); } - let html_file_name = match Path::new(&link).file_name() { - Some(link) => link, - None => return origin.to_string(), - }; - if let Some(md_file_name) = file_name_map.get(html_file_name) { - origin.replace( - &*html_file_name.to_string_lossy(), - &md_file_name.to_string_lossy(), - ) + let resolved_path = resolve_relative_path(current_html_path, link); + if let Some(md_path) = html_to_md.get(&resolved_path) { + let current_md_path = html_to_md + .get(current_html_path) + .cloned() + .unwrap_or_else(|| current_html_path.with_extension("md")); + let replacement = path_to_markdown_link(&relative_path(¤t_md_path, md_path)); + origin.replace(link, &replacement) } else { origin.to_string() } @@ -196,7 +462,7 @@ fn write_book_toml( .collect::>(); let description = epub_doc .mdata("description") - .map(|m| htmd::convert(&m.value).expect("unreachable")); + .and_then(|m| htmd::convert(&m.value).ok()); let lang = epub_doc .mdata("language") .or_else(|| epub_doc.mdata("lang")) @@ -218,10 +484,131 @@ mod tests { #[test] fn test_replace_links() { let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)"; - let markdown = LINK.replace_all(&markdown, |caps: &Captures| { - let link = &caps["link"]; - caps[0].replace(link, "link.md") - }); - assert_eq!(markdown, "[hello](link.md#xxx) [hi](link.md)"); + let html_to_md = HashMap::from([ + ( + PathBuf::from("text/current.xhtml"), + PathBuf::from("text/current.md"), + ), + ( + PathBuf::from("text/hello.html"), + PathBuf::from("text/hello.md"), + ), + (PathBuf::from("text/hi.xhtml"), PathBuf::from("text/hi.md")), + ]); + + let markdown = post_process_md(markdown, Path::new("text/current.xhtml"), &html_to_md); + + assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hi.md)"); + } + + #[test] + fn test_replace_links_resolves_relative_paths() { + let markdown = r"[next](../part2/index.xhtml#target) [same](chapter.xhtml) [site](https://example.com/index.xhtml)"; + let html_to_md = HashMap::from([ + ( + PathBuf::from("OPS/part1/current.xhtml"), + PathBuf::from("OPS/part1/current.md"), + ), + ( + PathBuf::from("OPS/part1/chapter.xhtml"), + PathBuf::from("OPS/part1/chapter.md"), + ), + ( + PathBuf::from("OPS/part2/index.xhtml"), + PathBuf::from("OPS/part2/index.md"), + ), + ]); + + let markdown = post_process_md(markdown, Path::new("OPS/part1/current.xhtml"), &html_to_md); + + assert_eq!( + markdown, + "[next](../part2/index.md#target) [same](chapter.md) [site](https://example.com/index.xhtml)" + ); + } + + #[test] + fn test_nav_fragment_is_preserved_in_summary() { + let nav = NavPoint { + label: "Section I".to_string(), + content: PathBuf::from("epub/text/chapter.xhtml#section-1"), + children: Vec::new(), + play_order: Some(1), + }; + let html_to_md = HashMap::from([( + PathBuf::from("epub/text/chapter.xhtml"), + PathBuf::from("epub/text/chapter.md"), + )]); + + let markdown = epub_nav_to_md(&nav, 0, &html_to_md).unwrap(); + + assert_eq!(markdown, "- [Section I](epub/text/chapter.md#section-1)\n"); + } + + #[test] + fn test_epub_html_conversion_skips_head_metadata() { + let html = r#" + + + A Scandal in Bohemia + + + + +
+

A Scandal in Bohemia

+

To Sherlock Holmes she is always the woman.

+
+ + + "#; + let title = "A Scandal in Bohemia".to_string(); + + let markdown = convert_epub_html_to_md(html).unwrap(); + let markdown = add_missing_chapter_title(&markdown, Some(&title)); + + assert_eq!( + markdown, + "## A Scandal in Bohemia\n\nTo Sherlock Holmes she is always *the* woman." + ); + } + + #[test] + fn test_epub_html_conversion_preserves_ids_as_anchors() { + let html = r#" + + +
+

Chapter One

+

Opening paragraph.

+
+ + + "#; + + let markdown = convert_epub_html_to_md(html).unwrap(); + let markdown = add_missing_chapter_title(&markdown, Some("Chapter One")); + + assert!(markdown.starts_with("\n\n## Chapter One")); + } + + #[test] + fn test_missing_body_title_uses_toc_label() { + let html = r#" + + + Head Metadata Title + + +

Opening paragraph.

+ + + "#; + let title = "Chapter One".to_string(); + + let markdown = convert_epub_html_to_md(html).unwrap(); + let markdown = add_missing_chapter_title(&markdown, Some(&title)); + + assert_eq!(markdown, "# Chapter One\n\nOpening paragraph."); } }