From 5d107696375a602f42c375989c490e2af9fce161 Mon Sep 17 00:00:00 2001 From: Maverick Liu Date: Fri, 21 Feb 2025 23:35:38 +0800 Subject: [PATCH] dev --- Cargo.lock | 36 +++++++++++++------ Cargo.toml | 8 ++++- README.md | 3 +- src/error.rs | 16 +++++++++ src/lib.rs | 97 ++++++++++++++++++++++++++++++++++------------------ src/main.rs | 4 +-- 6 files changed, 115 insertions(+), 49 deletions(-) create mode 100644 src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 91e6dd7..c8a8861 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,12 +67,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "anyhow" -version = "1.0.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" - [[package]] name = "arbitrary" version = "1.4.1" @@ -213,7 +207,7 @@ checksum = "f7425903972972788e21d4a8cbec3589c232cca4fab21d320187a8e452f6465e" dependencies = [ "percent-encoding", "regex", - "thiserror", + "thiserror 1.0.69", "xml-rs", "zip", ] @@ -222,11 +216,11 @@ dependencies = [ name = "epub2mdbook" version = "0.1.0" dependencies = [ - "anyhow", "clap", "epub", "html2md", "regex", + "thiserror 2.0.11", ] [[package]] @@ -321,7 +315,7 @@ dependencies = [ "combine", "jni-sys", "log", - "thiserror", + "thiserror 1.0.69", "walkdir", ] @@ -698,7 +692,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -712,6 +715,17 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "toml_datetime" version = "0.6.8" @@ -878,5 +892,5 @@ dependencies = [ "flate2", "indexmap", "num_enum", - "thiserror", + "thiserror 1.0.69", ] diff --git a/Cargo.toml b/Cargo.toml index 295ec2b..1657ef3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,10 +2,16 @@ name = "epub2mdbook" version = "0.1.0" edition = "2021" +description = "A tool to convert EPUB files to MDBook format" +authors = ["Maverick Liu "] +license = "MIT" +repository = "https://github.com/cyborg42/epub2mdbook" +keywords = ["epub", "mdbook", "converter", "ebook"] +categories = ["command-line-utilities", "text-processing"] [dependencies] -anyhow = "1.0.96" clap = { version = "4.5.30", features = ["derive"] } epub = "2.1.1" html2md = "0.2.15" regex = "1.11.1" +thiserror = "2.0.11" diff --git a/README.md b/README.md index c9a6a3d..de7164c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,8 @@ This is a powerful tool to convert EPUB files to MDBook format. ### CLI ```bash -cargo run -- --input-epub-path path/to/input.epub --output-dir path/to/output +cargo install epub2mdbook +epub2mdbook --input-epub path/to/input.epub --output-dir path/to/output ``` ### Rust diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..5990acb --- /dev/null +++ b/src/error.rs @@ -0,0 +1,16 @@ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum Error { + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("EPUB error: {0}")] + Epub(#[from] epub::doc::DocError), + + #[error("Invalid UTF-8: {0}")] + Utf8(#[from] std::string::FromUtf8Error), + + #[error("{0} is not a file")] + NotAFile(String), +} diff --git a/src/lib.rs b/src/lib.rs index b378a06..ce20dbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,28 @@ +pub mod error; + use epub::doc::{EpubDoc, NavPoint}; -use html2md::parse_html; +use error::Error; use regex::{Captures, Regex}; use std::collections::HashMap; -use std::fs; use std::io::{Read, Seek}; use std::path::{Path, PathBuf}; +use std::sync::LazyLock; +use std::{fs, io}; +/// Convert an EPUB file to an MDBook +/// +/// # Arguments +/// +/// * `epub_path` - The path to the EPUB file +/// * `output_dir` - The path to the output directory +/// pub fn convert_epub_to_mdbook( epub_path: impl AsRef, output_dir: Option>, -) -> anyhow::Result<()> { +) -> Result<(), Error> { let epub_path = epub_path.as_ref(); if !epub_path.is_file() { - return Err(anyhow::anyhow!("{} is not a file", epub_path.display())); + return Err(Error::NotAFile(epub_path.display().to_string())); } let book_name = epub_path.with_extension(""); let book_name = book_name.file_name().unwrap().to_string_lossy().to_string(); @@ -31,7 +41,7 @@ pub fn convert_epub_to_mdbook( }; let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned()); - let (toc, html_to_md) = toc_to_md(&doc, &title)?; + let (toc, html_to_md) = toc_to_md(&doc, &title); fs::write(output_dir.join("src/SUMMARY.md"), toc)?; extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?; @@ -39,12 +49,12 @@ pub fn convert_epub_to_mdbook( Ok(()) } -pub fn nav_point_to_md( +fn nav_to_md( nav: &NavPoint, indent: usize, - html_files: &HashMap, + html_to_md: &HashMap, ) -> Option { - let file = html_files.get(&nav.content)?; + let file = html_to_md.get(&nav.content)?; let mut md = format!( "{}- [{}]({})\n", " ".repeat(indent), @@ -52,20 +62,31 @@ pub fn nav_point_to_md( file.to_string_lossy() ); for child in &nav.children { - if let Some(child_md) = nav_point_to_md(child, indent + 1, html_files) { + if let Some(child_md) = nav_to_md(child, indent + 1, html_to_md) { md.push_str(&child_md); } } Some(md) } +/// Convert the table of contents to SUMMARY.md +/// +/// # Arguments +/// +/// * `doc` - The EPUB document +/// * `title` - The title of the book +/// +/// # Returns +/// +/// * `summary_md` - The SUMMARY.md content +/// * `html_to_md` - The file mapping from html to md pub fn toc_to_md( doc: &EpubDoc, title: &str, -) -> anyhow::Result<(String, HashMap)> { +) -> (String, HashMap) { let toc = doc.toc.clone(); - let mut markdown = format!("# {}\n\n", title); + let mut summary_md = format!("# {}\n\n", title); let html_to_md = doc .resources .iter() @@ -73,48 +94,57 @@ pub fn toc_to_md( .map(|(_, (path, _))| (path.clone(), path.with_extension("md"))) .collect::>(); for nav in toc { - if let Some(md) = nav_point_to_md(&nav, 0, &html_to_md) { - markdown.push_str(&md); + if let Some(md) = nav_to_md(&nav, 0, &html_to_md) { + summary_md.push_str(&md); } } - Ok((markdown, html_to_md)) + (summary_md, html_to_md) } -pub fn extract_chapters_and_resources( +/// Capture the `{link}` without `#`, eg: +/// ``` +/// [ABC]({abc.html}#xxx) +/// [ABC]({abc.html}) +/// ``` +static LINK_REGEX: LazyLock = + LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).unwrap()); + +fn extract_chapters_and_resources( doc: &mut EpubDoc, output_dir: impl AsRef, html_to_md: &HashMap, -) -> anyhow::Result<()> { +) -> Result<(), Error> { let output_dir = output_dir.as_ref(); let src_dir = output_dir.join("src"); - let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap(); // [abc](abc.html) for (_, (path, _)) in doc.resources.clone().into_iter() { let content = match doc.get_resource_by_path(&path) { Some(content) => content, - None => continue, + None => continue, // unreachable }; - if let Some(path) = html_to_md.get(&path) { + // html file, convert to md let target_path = src_dir.join(path); if let Some(parent) = target_path.parent() { fs::create_dir_all(parent)?; } let html = String::from_utf8(content)?; - let markdown = parse_html(&html); - let markdown = re - .replace_all(&markdown, |caps: &Captures| { - let link = caps[1].to_string(); - let ori = caps[0].to_string(); - if let Some(md_path) = html_to_md.get(&PathBuf::from(&link)) { + let markdown = LINK_REGEX + .replace_all(&html2md::parse_html(&html), |caps: &Captures| { + // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) + let origin = &caps[0]; + let link = &caps[1]; + if let Some(md_path) = html_to_md.get(&PathBuf::from(link)) { let md_path = md_path.to_string_lossy().to_string(); - ori.replace(&link, &md_path) + origin.replace(link, &md_path) } else { - ori + origin.to_string() } }) - .to_string(); + .replace(r"![]()", "") + .replace(r"[]()", ""); fs::write(target_path, markdown)?; } else { + // other file, just copy let target_path = src_dir.join(&path); if let Some(parent) = target_path.parent() { fs::create_dir_all(parent)?; @@ -125,11 +155,11 @@ pub fn extract_chapters_and_resources( Ok(()) } -pub fn write_book_toml( +fn write_book_toml( output_dir: impl AsRef, title: &str, creator: Option, -) -> anyhow::Result<()> { +) -> io::Result<()> { let output_dir = output_dir.as_ref(); let creator = match creator { Some(creator) => format!("author = \"{creator}\"\n"), @@ -145,12 +175,11 @@ mod tests { use super::*; #[test] fn test_replace_links() { - let markdown = r"[hello](hello.html)"; - let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap(); - let markdown = re.replace_all(&markdown, |caps: &Captures| { + let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)"; + let markdown = LINK_REGEX.replace_all(&markdown, |caps: &Captures| { let link = caps[1].to_string(); caps[0].replace(&link, "hello.md") }); - assert_eq!(markdown, "[hello](hello.md)"); + assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hello.md)"); } } diff --git a/src/main.rs b/src/main.rs index ec905d2..5396a2f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use clap::Parser; -use epub2mdbook::convert_epub_to_mdbook; +use epub2mdbook::{convert_epub_to_mdbook, error::Error}; #[derive(Parser)] struct Args { @@ -13,7 +13,7 @@ struct Args { output_dir: Option, } -fn main() -> anyhow::Result<()> { +fn main() -> Result<(), Error> { let args = Args::parse(); convert_epub_to_mdbook(args.input_epub, args.output_dir)?; println!("Conversion completed successfully!");