Fix EPUB chapter titles and internal links

This commit is contained in:
Maverick Liu 2026-05-15 23:09:12 +08:00
parent 0841abdd73
commit e15cb7b1a8
3 changed files with 534 additions and 189 deletions

252
Cargo.lock generated
View file

@ -19,9 +19,9 @@ dependencies = [
[[package]]
name = "anstream"
version = "0.6.18"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
dependencies = [
"anstyle",
"anstyle-parse",
@ -34,15 +34,15 @@ dependencies = [
[[package]]
name = "anstyle"
version = "1.0.10"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
dependencies = [
"utf8parse",
]
@ -108,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.54"
version = "4.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394"
checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
dependencies = [
"clap_builder",
"clap_derive",
@ -118,9 +118,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.54"
version = "4.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00"
checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
dependencies = [
"anstream",
"anstyle",
@ -130,9 +130,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.5.49"
version = "4.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671"
checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
dependencies = [
"heck",
"proc-macro2",
@ -142,9 +142,9 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.7.4"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
[[package]]
name = "colorchoice"
@ -187,7 +187,7 @@ dependencies = [
[[package]]
name = "epub2mdbook"
version = "0.16.1"
version = "0.17.0"
dependencies = [
"clap",
"epub",
@ -195,7 +195,7 @@ dependencies = [
"mdbook-core",
"regex",
"thiserror",
"toml",
"toml 1.1.2+spec-1.1.0",
]
[[package]]
@ -221,21 +221,11 @@ dependencies = [
"miniz_oxide",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "hashbrown"
version = "0.16.1"
version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
[[package]]
name = "heck"
@ -245,31 +235,30 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "htmd"
version = "0.5.0"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60ae59466542f2346e43d4a5e9b4432a1fc915b279c9fc0484e9ed7379121454"
checksum = "7eee9b00ee2e599b4f86507157e3db786e7a3319fc225f0e9584151dbea2291d"
dependencies = [
"html5ever",
"markup5ever_rcdom",
"phf 0.13.1",
"phf",
]
[[package]]
name = "html5ever"
version = "0.35.0"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4"
checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2"
dependencies = [
"log",
"markup5ever",
"match_token",
]
[[package]]
name = "indexmap"
version = "2.12.1"
version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2"
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
dependencies = [
"equivalent",
"hashbrown",
@ -318,17 +307,11 @@ version = "0.4.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.35.0"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3"
checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862"
dependencies = [
"log",
"tendril",
@ -337,9 +320,9 @@ dependencies = [
[[package]]
name = "markup5ever_rcdom"
version = "0.35.0+unofficial"
version = "0.38.0+unofficial"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8bcd53df4748257345b8bc156d620340ce0f015ec1c7ef1cff475543888a31d"
checksum = "333171ccdf66e915257740d44e38ea5b1b19ce7b45d33cc35cb6f118fbd981ff"
dependencies = [
"html5ever",
"markup5ever",
@ -347,17 +330,6 @@ dependencies = [
"xml5ever",
]
[[package]]
name = "match_token"
version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "mdbook-core"
version = "0.5.2"
@ -368,7 +340,7 @@ dependencies = [
"regex",
"serde",
"serde_json",
"toml",
"toml 0.9.10+spec-1.1.0",
"tracing",
]
@ -429,15 +401,6 @@ version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
[[package]]
name = "phf"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
"phf_shared 0.11.3",
]
[[package]]
name = "phf"
version = "0.13.1"
@ -445,28 +408,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
dependencies = [
"phf_macros",
"phf_shared 0.13.1",
"phf_shared",
"serde",
]
[[package]]
name = "phf_codegen"
version = "0.11.3"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a"
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
]
[[package]]
name = "phf_generator"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared 0.11.3",
"rand",
"phf_generator",
"phf_shared",
]
[[package]]
@ -476,7 +429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
dependencies = [
"fastrand",
"phf_shared 0.13.1",
"phf_shared",
]
[[package]]
@ -485,22 +438,13 @@ version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
dependencies = [
"phf_generator 0.13.1",
"phf_shared 0.13.1",
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "phf_shared"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.13.1"
@ -524,37 +468,22 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.105"
version = "1.0.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7"
checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.38"
version = "1.0.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "redox_syscall"
version = "0.5.9"
@ -566,9 +495,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.12.2"
version = "1.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4"
checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
dependencies = [
"aho-corasick",
"memchr",
@ -644,9 +573,9 @@ dependencies = [
[[package]]
name = "serde_spanned"
version = "1.0.4"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
checksum = "6662b5879511e06e8999a8a235d848113e942c9124f211511b16466ee2995f26"
dependencies = [
"serde_core",
]
@ -671,25 +600,25 @@ checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd"
[[package]]
name = "string_cache"
version = "0.8.8"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "938d512196766101d333398efde81bc1f37b00cb42c2f8350e5df639f040bbbe"
checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared 0.11.3",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.4"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0"
checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
dependencies = [
"phf_generator 0.11.3",
"phf_shared 0.11.3",
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
@ -702,9 +631,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.98"
version = "2.0.117"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1"
checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
dependencies = [
"proc-macro2",
"quote",
@ -713,29 +642,28 @@ dependencies = [
[[package]]
name = "tendril"
version = "0.4.3"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24"
dependencies = [
"futf",
"mac",
"new_debug_unreachable",
"utf-8",
]
[[package]]
name = "thiserror"
version = "2.0.17"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.17"
version = "2.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
dependencies = [
"proc-macro2",
"quote",
@ -751,10 +679,25 @@ dependencies = [
"indexmap",
"serde_core",
"serde_spanned",
"toml_datetime",
"toml_datetime 0.7.5+spec-1.1.0",
"toml_parser",
"toml_writer",
"winnow",
"winnow 0.7.14",
]
[[package]]
name = "toml"
version = "1.1.2+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81f3d15e84cbcd896376e6730314d59fb5a87f31e4b038454184435cd57defee"
dependencies = [
"indexmap",
"serde_core",
"serde_spanned",
"toml_datetime 1.1.1+spec-1.1.0",
"toml_parser",
"toml_writer",
"winnow 1.0.3",
]
[[package]]
@ -767,19 +710,28 @@ dependencies = [
]
[[package]]
name = "toml_parser"
version = "1.0.6+spec-1.1.0"
name = "toml_datetime"
version = "1.1.1+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44"
checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7"
dependencies = [
"winnow",
"serde_core",
]
[[package]]
name = "toml_parser"
version = "1.1.2+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526"
dependencies = [
"winnow 1.0.3",
]
[[package]]
name = "toml_writer"
version = "1.0.6+spec-1.1.0"
version = "1.1.1+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
checksum = "756daf9b1013ebe47a8776667b466417e2d4c5679d441c26230efd9ef78692db"
[[package]]
name = "tracing"
@ -832,11 +784,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "web_atoms"
version = "0.1.3"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414"
checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538"
dependencies = [
"phf 0.11.3",
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
@ -921,6 +873,12 @@ version = "0.7.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
[[package]]
name = "winnow"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0592e1c9d151f854e6fd382574c3a0855250e1d9b2f99d9281c6e6391af352f1"
[[package]]
name = "xml"
version = "1.2.0"
@ -938,9 +896,9 @@ dependencies = [
[[package]]
name = "xml5ever"
version = "0.35.0"
version = "0.38.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494"
checksum = "d3dc9559429edf0cd3f327cc0afd9d6b36fa8cec6d93107b7fbe64f806b5f2d9"
dependencies = [
"log",
"markup5ever",

View file

@ -1,6 +1,6 @@
[package]
name = "epub2mdbook"
version = "0.16.1"
version = "0.17.0"
edition = "2024"
description = "A tool to convert EPUB files to MDBook format"
authors = ["Maverick Liu <maverick.liu42@gmail.com>"]
@ -10,10 +10,10 @@ keywords = ["epub", "mdbook", "converter", "ebook"]
categories = ["command-line-utilities", "text-processing"]
[dependencies]
clap = { version = "4.5.54", features = ["derive"] }
clap = { version = "4.6.1", features = ["derive"] }
epub = "2.1.5"
htmd = "0.5.0"
htmd = "0.5.4"
mdbook-core = "0.5.2"
regex = "1.12.2"
thiserror = "2.0.17"
toml = "0.9.10"
regex = "1.12.3"
thiserror = "2.0.18"
toml = "1.1.2"

View file

@ -2,12 +2,13 @@ pub mod error;
use epub::doc::{EpubDoc, NavPoint};
use error::Error;
use htmd::element_handler::{HandlerResult, Handlers};
use mdbook_core::config::BookConfig;
use regex::{Captures, Regex};
use std::collections::HashMap;
use std::ffi::OsStr;
use std::ffi::{OsStr, OsString};
use std::io::{Read, Seek};
use std::path::{Path, PathBuf};
use std::path::{Component, Path, PathBuf};
use std::sync::LazyLock;
use std::{fs, io};
@ -42,7 +43,8 @@ pub fn convert_epub_to_mdbook(
let mut epub_doc = EpubDoc::new(epub_path)?;
let (summary_md, html_to_md) = generate_summary_md(&epub_doc);
extract_chapters_and_resources(&mut epub_doc, &output_dir, &html_to_md)?;
let html_to_title = collect_chapter_titles(&epub_doc, &html_to_md);
extract_chapters_and_resources(&mut epub_doc, &output_dir, &html_to_md, &html_to_title)?;
fs::write(output_dir.join("src/SUMMARY.md"), summary_md)?;
write_book_toml(&epub_doc, &output_dir)?;
Ok(())
@ -53,13 +55,14 @@ fn epub_nav_to_md(
indent: usize,
html_to_md: &HashMap<PathBuf, PathBuf>,
) -> Option<String> {
let file = html_to_md.get(&nav.content)?;
let mut md = format!(
"{}- [{}]({})\n",
" ".repeat(indent),
nav.label,
file.to_string_lossy()
);
let (content_path, fragment) = split_fragment(&nav.content);
let file = html_to_md.get(&content_path)?;
let mut link = path_to_markdown_link(file);
if let Some(fragment) = fragment {
link.push('#');
link.push_str(&fragment);
}
let mut md = format!("{}- [{}]({})\n", " ".repeat(indent), nav.label, link);
for child in &nav.children {
if let Some(child_md) = epub_nav_to_md(child, indent + 1, html_to_md) {
md.push_str(&child_md);
@ -95,23 +98,48 @@ pub fn generate_summary_md<R: Read + Seek>(
})
.map(|(_, resource)| (resource.path.clone(), resource.path.with_extension("md")))
.collect::<HashMap<PathBuf, PathBuf>>();
if epub_doc.toc.is_empty() {
summary_md.push_str(&spine_to_md(epub_doc, &html_to_md));
} else {
for nav in &epub_doc.toc {
if let Some(md) = epub_nav_to_md(nav, 0, &html_to_md) {
summary_md.push_str(&md);
}
}
}
(summary_md, html_to_md)
}
/// Builds a flat `SUMMARY.md` list from the EPUB spine.
///
/// Only spine items flagged `linear` are considered. An item is emitted when
/// its `idref` resolves to a resource and that resource's path has an entry
/// in `html_to_md`; everything else is silently skipped. Each emitted line
/// uses a title derived from the source file name and links to the
/// converted Markdown file.
fn spine_to_md<R: Read + Seek>(
    epub_doc: &EpubDoc<R>,
    html_to_md: &HashMap<PathBuf, PathBuf>,
) -> String {
    epub_doc
        .spine
        .iter()
        .filter(|item| item.linear)
        .filter_map(|item| epub_doc.resources.get(&item.idref))
        .filter_map(|resource| {
            let md_file = html_to_md.get(&resource.path)?;
            Some(format!(
                "- [{}]({})\n",
                path_to_title(&resource.path),
                path_to_markdown_link(md_file)
            ))
        })
        .collect()
}
fn extract_chapters_and_resources<R: Read + Seek>(
epub_doc: &mut EpubDoc<R>,
output_dir: impl AsRef<Path>,
html_to_md: &HashMap<PathBuf, PathBuf>,
html_to_title: &HashMap<PathBuf, String>,
) -> Result<(), Error> {
let file_name_map = html_to_md
.iter()
.filter_map(|(k, v)| Some((k.file_name()?, v.file_name()?)))
.collect::<HashMap<_, _>>();
let src_dir = output_dir.as_ref().join("src");
for (_, resource) in epub_doc.resources.clone() {
let path = &resource.path;
@ -122,8 +150,10 @@ fn extract_chapters_and_resources<R: Read + Seek>(
let target_path = if let Some(md_path) = html_to_md.get(path) {
// html file, convert to md
let html = String::from_utf8(content.clone())?;
let markdown = htmd::convert(&html)?;
content = post_process_md(&markdown, &file_name_map).into_bytes();
let markdown = convert_epub_html_to_md(&html)?;
let markdown =
add_missing_chapter_title(&markdown, html_to_title.get(path).map(String::as_str));
content = post_process_md(&markdown, path, html_to_md).into_bytes();
if md_path == Path::new("SUMMARY.md") {
src_dir.join("_SUMMARY.md")
} else {
@ -142,13 +172,246 @@ fn extract_chapters_and_resources<R: Read + Seek>(
Ok(())
}
/// Maps each HTML resource path to the title its chapter should carry.
///
/// Labels found in the table of contents are recorded first. Afterwards,
/// every spine resource that is present in `html_to_md` and still has no
/// title receives one derived from its file name.
fn collect_chapter_titles<R: Read + Seek>(
    epub_doc: &EpubDoc<R>,
    html_to_md: &HashMap<PathBuf, PathBuf>,
) -> HashMap<PathBuf, String> {
    let mut titles = HashMap::new();
    epub_doc
        .toc
        .iter()
        .for_each(|nav| collect_nav_titles(nav, &mut titles));

    // Spine fallback: never overrides a title that came from the TOC.
    let spine_paths = epub_doc
        .spine
        .iter()
        .filter_map(|item| epub_doc.resources.get(&item.idref))
        .map(|resource| &resource.path)
        .filter(|path| html_to_md.contains_key(*path));
    for path in spine_paths {
        titles
            .entry(path.clone())
            .or_insert_with(|| path_to_title(path));
    }
    titles
}
/// Records the label of `nav` and, recursively, of all its children.
///
/// The key is the nav point's content path with any `#fragment` removed.
/// Blank labels are ignored, and the first label seen for a given path is
/// kept; later nav points targeting the same file do not replace it.
fn collect_nav_titles(nav: &NavPoint, html_to_title: &mut HashMap<PathBuf, String>) {
    let trimmed = nav.label.trim();
    if !trimmed.is_empty() {
        html_to_title
            .entry(strip_fragment(&nav.content))
            .or_insert_with(|| trimmed.to_owned());
    }
    nav.children
        .iter()
        .for_each(|child| collect_nav_titles(child, html_to_title));
}
/// Converts one EPUB (X)HTML document to Markdown.
///
/// The `head` element is skipped entirely, and `preserve_id_handler` is
/// registered for the tags listed in `ID_TAGS`.
///
/// # Errors
/// Returns whatever error the htmd converter reports.
fn convert_epub_html_to_md(html: &str) -> io::Result<String> {
    /// Tags routed through `preserve_id_handler`.
    const ID_TAGS: &[&str] = &[
        "a",
        "article",
        "aside",
        "blockquote",
        "body",
        "div",
        "figcaption",
        "figure",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "main",
        "nav",
        "p",
        "section",
        "span",
        "td",
        "th",
    ];

    let converter = htmd::HtmlToMarkdown::builder()
        .skip_tags(vec!["head"])
        .add_handler(ID_TAGS.to_vec(), preserve_id_handler)
        .build();
    converter.convert(html)
}
/// htmd element handler that keeps an element's `id` alive as an HTML anchor.
///
/// The element is first converted by the fallback handler. If the element
/// carries a non-blank `id` attribute, an empty `<a id="..."></a>` anchor is
/// placed in front of the converted content so that `file.md#id` links keep
/// working after conversion.
///
/// Placement depends on the tag:
/// * inline tags (`a`, `span`): the anchor is prepended directly, with no
///   line breaks, so an id-bearing footnote reference or span in the middle
///   of a sentence (or inside a heading) does not split the surrounding
///   paragraph or heading;
/// * every other tag: the anchor is put on its own line, separated from the
///   content by blank lines, and leading newlines of the content are dropped.
///
/// Returns `None` when the fallback handler produces nothing.
fn preserve_id_handler(handlers: &dyn Handlers, element: htmd::Element) -> Option<HandlerResult> {
    // Captured before `element` is moved into the fallback handler.
    // NOTE(review): relies on htmd exposing the tag name as `Element::tag` — confirm against the pinned htmd version.
    let is_inline = matches!(element.tag, "a" | "span");
    let id = element
        .attrs
        .iter()
        .find(|attr| &*attr.name.local == "id")
        .map(|attr| attr.value.to_string())
        .filter(|id| !id.trim().is_empty());

    let mut result = handlers.fallback(element)?;
    if let Some(id) = id {
        let anchor = format!("<a id=\"{}\"></a>", escape_attr(&id));
        result.content = if is_inline {
            // Keep inline content inline: no blank lines around the anchor.
            format!("{anchor}{}", result.content)
        } else {
            let content = result.content.trim_start_matches('\n');
            format!("\n\n{anchor}\n\n{content}")
        };
    }
    Some(result)
}
/// Prepends a level-1 heading to `markdown` when the chapter has none.
///
/// * `markdown` – converted chapter body.
/// * `title` – candidate title (for example a TOC label); `None` or a blank
///   value leaves `markdown` untouched.
///
/// The title is whitespace-normalised before use: leading/trailing
/// whitespace is removed and every internal run of whitespace, including
/// line breaks, becomes a single space. An ATX heading must fit on one line,
/// so a label spanning several lines would otherwise produce a truncated
/// heading followed by stray text.
///
/// If the first meaningful line of `markdown` is already a heading, the
/// input is returned unchanged. Otherwise the result is `# {title}`,
/// followed by a blank line and the body (with its leading newlines
/// removed) when the body is non-empty.
fn add_missing_chapter_title(markdown: &str, title: Option<&str>) -> String {
    let title = title
        .map(|raw| raw.split_whitespace().collect::<Vec<_>>().join(" "))
        .filter(|title| !title.is_empty());
    let Some(title) = title else {
        return markdown.to_string();
    };
    if starts_with_markdown_heading(markdown) {
        return markdown.to_string();
    }

    let body = markdown.trim_start_matches('\n');
    if body.is_empty() {
        format!("# {title}")
    } else {
        format!("# {title}\n\n{body}")
    }
}
fn starts_with_markdown_heading(markdown: &str) -> bool {
for line in markdown.lines().filter(|line| !line.trim().is_empty()) {
if is_html_anchor(line) {
continue;
}
return is_atx_heading(line);
}
false
}
/// Checks whether `line` is an ATX heading (`#` through `######`).
///
/// At most three leading spaces are allowed, and the run of one to six `#`
/// characters must be followed by a space, a tab, or the end of the line.
fn is_atx_heading(line: &str) -> bool {
    let body = line.trim_start_matches(' ');
    let indent = line.len() - body.len();
    if indent > 3 {
        return false;
    }

    let after_hashes = body.trim_start_matches('#');
    let level = body.len() - after_hashes.len();
    matches!(level, 1..=6)
        && matches!(after_hashes.chars().next(), None | Some(' ') | Some('\t'))
}
/// Checks whether `line`, ignoring surrounding whitespace, is a lone empty
/// `<a ...></a>` element that carries an `id` or `name` attribute.
fn is_html_anchor(line: &str) -> bool {
    let candidate = line.trim();
    if !candidate.starts_with("<a ") || !candidate.ends_with("></a>") {
        return false;
    }
    [" id=", " name="]
        .iter()
        .any(|needle| candidate.contains(*needle))
}
/// Returns `path` without its `#fragment` suffix, if it has one.
fn strip_fragment(path: &Path) -> PathBuf {
    let (without_fragment, _fragment) = split_fragment(path);
    without_fragment
}
/// Splits `path` at its first `#` into the file part and the fragment.
///
/// The path is read lossily as UTF-8. Without a `#` the whole path is
/// returned and the fragment is `None`; with one, the fragment is everything
/// after the first `#` (which may be empty or contain further `#`).
fn split_fragment(path: &Path) -> (PathBuf, Option<String>) {
    let raw = path.to_string_lossy();
    if let Some(hash_at) = raw.find('#') {
        let file = &raw[..hash_at];
        // '#' is a single byte, so skipping one byte lands on a char boundary.
        let fragment = &raw[hash_at + 1..];
        (PathBuf::from(file), Some(fragment.to_owned()))
    } else {
        (PathBuf::from(&*raw), None)
    }
}
/// Derives a human-readable title from a file path.
///
/// The file stem is used with `-` and `_` replaced by spaces. When the stem
/// is missing, not valid UTF-8, or blank after replacement, the path
/// rendered as a Markdown link target is returned instead.
fn path_to_title(path: &Path) -> String {
    let from_stem = path
        .file_stem()
        .and_then(OsStr::to_str)
        .map(|stem| stem.replace(['-', '_'], " "));
    match from_stem {
        Some(title) if !title.trim().is_empty() => title,
        _ => path_to_markdown_link(path),
    }
}
/// Resolves `link` against the directory that contains `current_file`.
///
/// An absolute `link` is resolved from an empty base; root and prefix
/// components are discarded, so the result never starts at a filesystem
/// root. `.` components are ignored and `..` removes the last component
/// accumulated so far (a no-op when nothing is left to remove).
fn resolve_relative_path(current_file: &Path, link: &str) -> PathBuf {
    let target = Path::new(link);
    let base = if target.is_absolute() {
        None
    } else {
        current_file.parent()
    };

    let mut resolved = base.map(Path::to_path_buf).unwrap_or_default();
    for part in target.components() {
        match part {
            Component::Normal(name) => resolved.push(name),
            Component::ParentDir => {
                resolved.pop();
            }
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
    }
    resolved
}
/// Computes the path that leads from the directory of `from_file` to
/// `to_file`.
///
/// Both sides are normalised into component lists. The shared leading
/// components are dropped, one `..` is emitted for each remaining component
/// of the source directory, and the remaining components of the target are
/// appended.
fn relative_path(from_file: &Path, to_file: &Path) -> PathBuf {
    let base = normalized_components(from_file.parent().unwrap_or_else(|| Path::new("")));
    let target = normalized_components(to_file);
    let shared = base
        .iter()
        .zip(&target)
        .take_while(|(ours, theirs)| ours == theirs)
        .count();

    let mut relative = PathBuf::new();
    relative.extend(std::iter::repeat("..").take(base.len() - shared));
    relative.extend(&target[shared..]);
    relative
}
/// Flattens `path` into its named components.
///
/// `.`, root and prefix components are dropped. A `..` removes the most
/// recently collected component, or does nothing when none is left.
fn normalized_components(path: &Path) -> Vec<OsString> {
    path.components().fold(Vec::new(), |mut stack, component| {
        match component {
            Component::Normal(name) => stack.push(name.to_os_string()),
            Component::ParentDir => {
                stack.pop();
            }
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
        stack
    })
}
/// Renders `path` as a link target using `/` as the separator on every
/// platform.
///
/// Root and prefix components are omitted; `.` and `..` are kept verbatim;
/// named components are converted lossily to UTF-8.
fn path_to_markdown_link(path: &Path) -> String {
    let mut link = String::new();
    for component in path.components() {
        let segment: String = match component {
            Component::RootDir | Component::Prefix(_) => continue,
            Component::CurDir => ".".to_owned(),
            Component::ParentDir => "..".to_owned(),
            Component::Normal(name) => name.to_string_lossy().into_owned(),
        };
        // Segments are never empty, so a non-empty buffer means "not the first".
        if !link.is_empty() {
            link.push('/');
        }
        link.push_str(&segment);
    }
    link
}
/// Escapes `value` for use inside a double-quoted HTML attribute.
///
/// `&`, `"`, `<` and `>` are replaced by their named character references;
/// all other characters are copied through unchanged.
fn escape_attr(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Capture the `{link}` without `#`, eg:
/// ```text
/// [ABC]({abc.html}#xxx)
/// [ABC]({abc.html})
/// ```
static LINK: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"\[[^\]]+\]\((?P<link>[^#)]+)(#[^)]+)?\)"#).expect("unreachable")
Regex::new(r#"\[[^\]]+\]\((?P<link>[^#)]+)(?P<fragment>#[^)]+)?\)"#).expect("unreachable")
});
/// Match the URL link, eg:
/// ```text
@ -157,7 +420,11 @@ static LINK: LazyLock<Regex> = LazyLock::new(|| {
static URL_LINK: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").expect("unreachable"));
fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> String {
fn post_process_md(
markdown: &str,
current_html_path: &Path,
html_to_md: &HashMap<PathBuf, PathBuf>,
) -> String {
LINK.replace_all(markdown, |caps: &Captures| {
// replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx)
let origin = &caps[0];
@ -166,15 +433,14 @@ fn post_process_md(markdown: &str, file_name_map: &HashMap<&OsStr, &OsStr>) -> S
if URL_LINK.is_match(link) {
return origin.to_string();
}
let html_file_name = match Path::new(&link).file_name() {
Some(link) => link,
None => return origin.to_string(),
};
if let Some(md_file_name) = file_name_map.get(html_file_name) {
origin.replace(
&*html_file_name.to_string_lossy(),
&md_file_name.to_string_lossy(),
)
let resolved_path = resolve_relative_path(current_html_path, link);
if let Some(md_path) = html_to_md.get(&resolved_path) {
let current_md_path = html_to_md
.get(current_html_path)
.cloned()
.unwrap_or_else(|| current_html_path.with_extension("md"));
let replacement = path_to_markdown_link(&relative_path(&current_md_path, md_path));
origin.replace(link, &replacement)
} else {
origin.to_string()
}
@ -196,7 +462,7 @@ fn write_book_toml<R: Read + Seek>(
.collect::<Vec<_>>();
let description = epub_doc
.mdata("description")
.map(|m| htmd::convert(&m.value).expect("unreachable"));
.and_then(|m| htmd::convert(&m.value).ok());
let lang = epub_doc
.mdata("language")
.or_else(|| epub_doc.mdata("lang"))
@ -218,10 +484,131 @@ mod tests {
#[test]
fn test_replace_links() {
let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)";
let markdown = LINK.replace_all(&markdown, |caps: &Captures| {
let link = &caps["link"];
caps[0].replace(link, "link.md")
});
assert_eq!(markdown, "[hello](link.md#xxx) [hi](link.md)");
let html_to_md = HashMap::from([
(
PathBuf::from("text/current.xhtml"),
PathBuf::from("text/current.md"),
),
(
PathBuf::from("text/hello.html"),
PathBuf::from("text/hello.md"),
),
(PathBuf::from("text/hi.xhtml"), PathBuf::from("text/hi.md")),
]);
let markdown = post_process_md(markdown, Path::new("text/current.xhtml"), &html_to_md);
assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hi.md)");
}
/// Links are resolved relative to the current chapter and rewritten to the
/// matching `.md` file with fragments preserved; URL links are left alone.
#[test]
fn test_replace_links_resolves_relative_paths() {
    let current = Path::new("OPS/part1/current.xhtml");
    let html_to_md: HashMap<PathBuf, PathBuf> = [
        ("OPS/part1/current.xhtml", "OPS/part1/current.md"),
        ("OPS/part1/chapter.xhtml", "OPS/part1/chapter.md"),
        ("OPS/part2/index.xhtml", "OPS/part2/index.md"),
    ]
    .into_iter()
    .map(|(html, md)| (PathBuf::from(html), PathBuf::from(md)))
    .collect();

    let input = r"[next](../part2/index.xhtml#target) [same](chapter.xhtml) [site](https://example.com/index.xhtml)";
    let rewritten = post_process_md(input, current, &html_to_md);

    assert_eq!(
        rewritten,
        "[next](../part2/index.md#target) [same](chapter.md) [site](https://example.com/index.xhtml)"
    );
}
/// A nav point whose target carries a `#fragment` keeps that fragment in
/// the generated summary entry.
#[test]
fn test_nav_fragment_is_preserved_in_summary() {
    let mut html_to_md = HashMap::new();
    html_to_md.insert(
        PathBuf::from("epub/text/chapter.xhtml"),
        PathBuf::from("epub/text/chapter.md"),
    );
    let section = NavPoint {
        content: PathBuf::from("epub/text/chapter.xhtml#section-1"),
        label: "Section I".to_string(),
        play_order: Some(1),
        children: Vec::new(),
    };

    let entry = epub_nav_to_md(&section, 0, &html_to_md).unwrap();

    assert_eq!(entry, "- [Section I](epub/text/chapter.md#section-1)\n");
}
// Content inside <head> (title, script, style) must not appear in the
// converted Markdown, and a chapter that already opens with a heading must
// not get a second title prepended.
#[test]
fn test_epub_html_conversion_skips_head_metadata() {
// Fixture: metadata lives in <head>; the body already has its own <h2>.
let html = r#"
<html>
<head>
<title>A Scandal in Bohemia</title>
<script>console.log("metadata");</script>
<style>body { color: red; }</style>
</head>
<body>
<article>
<h2>A Scandal in Bohemia</h2>
<p>To Sherlock Holmes she is always <em>the</em> woman.</p>
</article>
</body>
</html>
"#;
let title = "A Scandal in Bohemia".to_string();
let markdown = convert_epub_html_to_md(html).unwrap();
// The title is offered, but the existing heading should win.
let markdown = add_missing_chapter_title(&markdown, Some(&title));
assert_eq!(
markdown,
"## A Scandal in Bohemia\n\nTo Sherlock Holmes she is always *the* woman."
);
}
// An `id` on a block element must survive conversion as an empty
// `<a id="..."></a>` anchor placed ahead of the element's content, and that
// anchor must not hide the existing heading from the title check.
#[test]
fn test_epub_html_conversion_preserves_ids_as_anchors() {
// Fixture: the id sits on the <section> that wraps the heading.
let html = r#"
<html>
<body>
<section id="chapter-1">
<h2>Chapter One</h2>
<p>Opening paragraph.</p>
</section>
</body>
</html>
"#;
let markdown = convert_epub_html_to_md(html).unwrap();
let markdown = add_missing_chapter_title(&markdown, Some("Chapter One"));
// Anchor first, then the original heading; no extra `# Chapter One` in front.
assert!(markdown.starts_with("<a id=\"chapter-1\"></a>\n\n## Chapter One"));
}
// When the body has no heading, the supplied title (not the <head> title)
// is inserted as a level-1 heading ahead of the body text.
#[test]
fn test_missing_body_title_uses_toc_label() {
// Fixture: the only title-like text is inside <head>, which is skipped.
let html = r#"
<html>
<head>
<title>Head Metadata Title</title>
</head>
<body>
<p>Opening paragraph.</p>
</body>
</html>
"#;
let title = "Chapter One".to_string();
let markdown = convert_epub_html_to_md(html).unwrap();
let markdown = add_missing_chapter_title(&markdown, Some(&title));
assert_eq!(markdown, "# Chapter One\n\nOpening paragraph.");
}
}