This commit is contained in:
Maverick Liu 2025-02-21 23:35:38 +08:00
parent 6bcc1d9682
commit 5d10769637
6 changed files with 115 additions and 49 deletions

36
Cargo.lock generated
View file

@ -67,12 +67,6 @@ dependencies = [
"windows-sys", "windows-sys",
] ]
[[package]]
name = "anyhow"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4"
[[package]] [[package]]
name = "arbitrary" name = "arbitrary"
version = "1.4.1" version = "1.4.1"
@ -213,7 +207,7 @@ checksum = "f7425903972972788e21d4a8cbec3589c232cca4fab21d320187a8e452f6465e"
dependencies = [ dependencies = [
"percent-encoding", "percent-encoding",
"regex", "regex",
"thiserror", "thiserror 1.0.69",
"xml-rs", "xml-rs",
"zip", "zip",
] ]
@ -222,11 +216,11 @@ dependencies = [
name = "epub2mdbook" name = "epub2mdbook"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow",
"clap", "clap",
"epub", "epub",
"html2md", "html2md",
"regex", "regex",
"thiserror 2.0.11",
] ]
[[package]] [[package]]
@ -321,7 +315,7 @@ dependencies = [
"combine", "combine",
"jni-sys", "jni-sys",
"log", "log",
"thiserror", "thiserror 1.0.69",
"walkdir", "walkdir",
] ]
@ -698,7 +692,16 @@ version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [ dependencies = [
"thiserror-impl", "thiserror-impl 1.0.69",
]
[[package]]
name = "thiserror"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [
"thiserror-impl 2.0.11",
] ]
[[package]] [[package]]
@ -712,6 +715,17 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "thiserror-impl"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "toml_datetime" name = "toml_datetime"
version = "0.6.8" version = "0.6.8"
@ -878,5 +892,5 @@ dependencies = [
"flate2", "flate2",
"indexmap", "indexmap",
"num_enum", "num_enum",
"thiserror", "thiserror 1.0.69",
] ]

View file

@ -2,10 +2,16 @@
name = "epub2mdbook" name = "epub2mdbook"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
description = "A tool to convert EPUB files to MDBook format"
authors = ["Maverick Liu <maverickliu@gmail.com>"]
license = "MIT"
repository = "https://github.com/cyborg42/epub2mdbook"
keywords = ["epub", "mdbook", "converter", "ebook"]
categories = ["command-line-utilities", "text-processing"]
[dependencies] [dependencies]
anyhow = "1.0.96"
clap = { version = "4.5.30", features = ["derive"] } clap = { version = "4.5.30", features = ["derive"] }
epub = "2.1.1" epub = "2.1.1"
html2md = "0.2.15" html2md = "0.2.15"
regex = "1.11.1" regex = "1.11.1"
thiserror = "2.0.11"

View file

@ -7,7 +7,8 @@ This is a powerful tool to convert EPUB files to MDBook format.
### CLI ### CLI
```bash ```bash
cargo run -- --input-epub-path path/to/input.epub --output-dir path/to/output cargo install epub2mdbook
epub2mdbook --input-epub path/to/input.epub --output-dir path/to/output
``` ```
### Rust ### Rust

16
src/error.rs Normal file
View file

@ -0,0 +1,16 @@
use thiserror::Error;
/// Errors that can be returned while converting an EPUB file to an mdBook project.
///
/// Variants carrying `#[from]` get a `From` conversion generated by `thiserror`,
/// so the underlying error can be propagated with the `?` operator.
#[derive(Error, Debug)]
pub enum Error {
/// A filesystem operation failed (e.g. creating a directory or writing a file).
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// An error reported by the `epub` crate for the EPUB document.
#[error("EPUB error: {0}")]
Epub(#[from] epub::doc::DocError),
/// Bytes that had to be decoded into a `String` were not valid UTF-8.
#[error("Invalid UTF-8: {0}")]
Utf8(#[from] std::string::FromUtf8Error),
/// The given input path is not a file; the payload is the path rendered as text.
#[error("{0} is not a file")]
NotAFile(String),
}

View file

@ -1,18 +1,28 @@
pub mod error;
use epub::doc::{EpubDoc, NavPoint}; use epub::doc::{EpubDoc, NavPoint};
use html2md::parse_html; use error::Error;
use regex::{Captures, Regex}; use regex::{Captures, Regex};
use std::collections::HashMap; use std::collections::HashMap;
use std::fs;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use std::{fs, io};
/// Convert an EPUB file to an MDBook
///
/// # Arguments
///
/// * `epub_path` - The path to the EPUB file
/// * `output_dir` - The path to the output directory
///
pub fn convert_epub_to_mdbook( pub fn convert_epub_to_mdbook(
epub_path: impl AsRef<Path>, epub_path: impl AsRef<Path>,
output_dir: Option<impl AsRef<Path>>, output_dir: Option<impl AsRef<Path>>,
) -> anyhow::Result<()> { ) -> Result<(), Error> {
let epub_path = epub_path.as_ref(); let epub_path = epub_path.as_ref();
if !epub_path.is_file() { if !epub_path.is_file() {
return Err(anyhow::anyhow!("{} is not a file", epub_path.display())); return Err(Error::NotAFile(epub_path.display().to_string()));
} }
let book_name = epub_path.with_extension(""); let book_name = epub_path.with_extension("");
let book_name = book_name.file_name().unwrap().to_string_lossy().to_string(); let book_name = book_name.file_name().unwrap().to_string_lossy().to_string();
@ -31,7 +41,7 @@ pub fn convert_epub_to_mdbook(
}; };
let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned()); let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned());
let (toc, html_to_md) = toc_to_md(&doc, &title)?; let (toc, html_to_md) = toc_to_md(&doc, &title);
fs::write(output_dir.join("src/SUMMARY.md"), toc)?; fs::write(output_dir.join("src/SUMMARY.md"), toc)?;
extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?; extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?;
@ -39,12 +49,12 @@ pub fn convert_epub_to_mdbook(
Ok(()) Ok(())
} }
pub fn nav_point_to_md( fn nav_to_md(
nav: &NavPoint, nav: &NavPoint,
indent: usize, indent: usize,
html_files: &HashMap<PathBuf, PathBuf>, html_to_md: &HashMap<PathBuf, PathBuf>,
) -> Option<String> { ) -> Option<String> {
let file = html_files.get(&nav.content)?; let file = html_to_md.get(&nav.content)?;
let mut md = format!( let mut md = format!(
"{}- [{}]({})\n", "{}- [{}]({})\n",
" ".repeat(indent), " ".repeat(indent),
@ -52,20 +62,31 @@ pub fn nav_point_to_md(
file.to_string_lossy() file.to_string_lossy()
); );
for child in &nav.children { for child in &nav.children {
if let Some(child_md) = nav_point_to_md(child, indent + 1, html_files) { if let Some(child_md) = nav_to_md(child, indent + 1, html_to_md) {
md.push_str(&child_md); md.push_str(&child_md);
} }
} }
Some(md) Some(md)
} }
/// Convert the table of contents to SUMMARY.md
///
/// # Arguments
///
/// * `doc` - The EPUB document
/// * `title` - The title of the book
///
/// # Returns
///
/// * `summary_md` - The SUMMARY.md content
/// * `html_to_md` - The file mapping from html to md
pub fn toc_to_md<R: Read + Seek>( pub fn toc_to_md<R: Read + Seek>(
doc: &EpubDoc<R>, doc: &EpubDoc<R>,
title: &str, title: &str,
) -> anyhow::Result<(String, HashMap<PathBuf, PathBuf>)> { ) -> (String, HashMap<PathBuf, PathBuf>) {
let toc = doc.toc.clone(); let toc = doc.toc.clone();
let mut markdown = format!("# {}\n\n", title); let mut summary_md = format!("# {}\n\n", title);
let html_to_md = doc let html_to_md = doc
.resources .resources
.iter() .iter()
@ -73,48 +94,57 @@ pub fn toc_to_md<R: Read + Seek>(
.map(|(_, (path, _))| (path.clone(), path.with_extension("md"))) .map(|(_, (path, _))| (path.clone(), path.with_extension("md")))
.collect::<HashMap<PathBuf, PathBuf>>(); .collect::<HashMap<PathBuf, PathBuf>>();
for nav in toc { for nav in toc {
if let Some(md) = nav_point_to_md(&nav, 0, &html_to_md) { if let Some(md) = nav_to_md(&nav, 0, &html_to_md) {
markdown.push_str(&md); summary_md.push_str(&md);
} }
} }
Ok((markdown, html_to_md)) (summary_md, html_to_md)
} }
pub fn extract_chapters_and_resources<R: Read + Seek>( /// Capture the `{link}` without `#`, eg:
/// ```text
/// [ABC]({abc.html}#xxx)
/// [ABC]({abc.html})
/// ```
static LINK_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).unwrap());
fn extract_chapters_and_resources<R: Read + Seek>(
doc: &mut EpubDoc<R>, doc: &mut EpubDoc<R>,
output_dir: impl AsRef<Path>, output_dir: impl AsRef<Path>,
html_to_md: &HashMap<PathBuf, PathBuf>, html_to_md: &HashMap<PathBuf, PathBuf>,
) -> anyhow::Result<()> { ) -> Result<(), Error> {
let output_dir = output_dir.as_ref(); let output_dir = output_dir.as_ref();
let src_dir = output_dir.join("src"); let src_dir = output_dir.join("src");
let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap(); // [abc](abc.html)
for (_, (path, _)) in doc.resources.clone().into_iter() { for (_, (path, _)) in doc.resources.clone().into_iter() {
let content = match doc.get_resource_by_path(&path) { let content = match doc.get_resource_by_path(&path) {
Some(content) => content, Some(content) => content,
None => continue, None => continue, // unreachable
}; };
if let Some(path) = html_to_md.get(&path) { if let Some(path) = html_to_md.get(&path) {
// html file, convert to md
let target_path = src_dir.join(path); let target_path = src_dir.join(path);
if let Some(parent) = target_path.parent() { if let Some(parent) = target_path.parent() {
fs::create_dir_all(parent)?; fs::create_dir_all(parent)?;
} }
let html = String::from_utf8(content)?; let html = String::from_utf8(content)?;
let markdown = parse_html(&html); let markdown = LINK_REGEX
let markdown = re .replace_all(&html2md::parse_html(&html), |caps: &Captures| {
.replace_all(&markdown, |caps: &Captures| { // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx)
let link = caps[1].to_string(); let origin = &caps[0];
let ori = caps[0].to_string(); let link = &caps[1];
if let Some(md_path) = html_to_md.get(&PathBuf::from(&link)) { if let Some(md_path) = html_to_md.get(&PathBuf::from(link)) {
let md_path = md_path.to_string_lossy().to_string(); let md_path = md_path.to_string_lossy().to_string();
ori.replace(&link, &md_path) origin.replace(link, &md_path)
} else { } else {
ori origin.to_string()
} }
}) })
.to_string(); .replace(r"![]()", "")
.replace(r"[]()", "");
fs::write(target_path, markdown)?; fs::write(target_path, markdown)?;
} else { } else {
// other file, just copy
let target_path = src_dir.join(&path); let target_path = src_dir.join(&path);
if let Some(parent) = target_path.parent() { if let Some(parent) = target_path.parent() {
fs::create_dir_all(parent)?; fs::create_dir_all(parent)?;
@ -125,11 +155,11 @@ pub fn extract_chapters_and_resources<R: Read + Seek>(
Ok(()) Ok(())
} }
pub fn write_book_toml( fn write_book_toml(
output_dir: impl AsRef<Path>, output_dir: impl AsRef<Path>,
title: &str, title: &str,
creator: Option<String>, creator: Option<String>,
) -> anyhow::Result<()> { ) -> io::Result<()> {
let output_dir = output_dir.as_ref(); let output_dir = output_dir.as_ref();
let creator = match creator { let creator = match creator {
Some(creator) => format!("author = \"{creator}\"\n"), Some(creator) => format!("author = \"{creator}\"\n"),
@ -145,12 +175,11 @@ mod tests {
use super::*; use super::*;
#[test] #[test]
fn test_replace_links() { fn test_replace_links() {
let markdown = r"[hello](hello.html)"; let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)";
let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap(); let markdown = LINK_REGEX.replace_all(&markdown, |caps: &Captures| {
let markdown = re.replace_all(&markdown, |caps: &Captures| {
let link = caps[1].to_string(); let link = caps[1].to_string();
caps[0].replace(&link, "hello.md") caps[0].replace(&link, "hello.md")
}); });
assert_eq!(markdown, "[hello](hello.md)"); assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hello.md)");
} }
} }

View file

@ -1,7 +1,7 @@
use std::path::PathBuf; use std::path::PathBuf;
use clap::Parser; use clap::Parser;
use epub2mdbook::convert_epub_to_mdbook; use epub2mdbook::{convert_epub_to_mdbook, error::Error};
#[derive(Parser)] #[derive(Parser)]
struct Args { struct Args {
@ -13,7 +13,7 @@ struct Args {
output_dir: Option<PathBuf>, output_dir: Option<PathBuf>,
} }
fn main() -> anyhow::Result<()> { fn main() -> Result<(), Error> {
let args = Args::parse(); let args = Args::parse();
convert_epub_to_mdbook(args.input_epub, args.output_dir)?; convert_epub_to_mdbook(args.input_epub, args.output_dir)?;
println!("Conversion completed successfully!"); println!("Conversion completed successfully!");