pub mod error; use epub::doc::{EpubDoc, NavPoint}; use error::Error; use regex::{Captures, Regex}; use std::collections::HashMap; use std::io::{Read, Seek}; use std::path::{Path, PathBuf}; use std::sync::LazyLock; use std::{fs, io}; /// Convert an EPUB file to an MDBook /// /// # Arguments /// /// * `epub_path` - The path to the EPUB file /// * `output_dir` - The path to the output directory /// pub fn convert_epub_to_mdbook( epub_path: impl AsRef, output_dir: Option>, ) -> Result<(), Error> { let epub_path = epub_path.as_ref(); if !epub_path.is_file() { return Err(Error::NotAFile(epub_path.display().to_string())); } let book_name = epub_path.with_extension(""); let book_name = book_name .file_name() .expect("unreachable") .to_string_lossy() .to_string(); let output_dir = match output_dir { Some(output_dir) => output_dir.as_ref().join(&book_name), None => PathBuf::from(".").join(&book_name), }; fs::create_dir_all(output_dir.join("src"))?; let mut doc = EpubDoc::new(epub_path)?; let title = if let Some(title) = doc.metadata.get("title") { title.first().cloned().unwrap_or(book_name) } else { book_name }; let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned()); let (toc, html_to_md) = toc_to_md(&doc, &title); fs::write(output_dir.join("src/SUMMARY.md"), toc)?; extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?; write_book_toml(&output_dir, &title, creator)?; Ok(()) } fn nav_to_md( nav: &NavPoint, indent: usize, html_to_md: &HashMap, ) -> Option { let file = html_to_md.get(&nav.content)?; let mut md = format!( "{}- [{}]({})\n", " ".repeat(indent), nav.label, file.to_string_lossy() ); for child in &nav.children { if let Some(child_md) = nav_to_md(child, indent + 1, html_to_md) { md.push_str(&child_md); } } Some(md) } /// Convert the table of contents to SUMMARY.md /// /// # Arguments /// /// * `doc` - The EPUB document /// * `title` - The title of the book /// /// # Returns /// /// * `summary_md` - The SUMMARY.md content /// * `html_to_md` - The file mapping from html to md pub fn toc_to_md( doc: &EpubDoc, title: &str, ) -> (String, HashMap) { let toc = doc.toc.clone(); let mut summary_md = format!("# {}\n\n", title); let html_to_md = doc .resources .iter() .filter(|(_, (_, mime))| mime == "application/xhtml+xml") .map(|(_, (path, _))| (path.clone(), path.with_extension("md"))) .collect::>(); for nav in toc { if let Some(md) = nav_to_md(&nav, 0, &html_to_md) { summary_md.push_str(&md); } } (summary_md, html_to_md) } fn extract_chapters_and_resources( doc: &mut EpubDoc, output_dir: impl AsRef, html_to_md: &HashMap, ) -> Result<(), Error> { let output_dir = output_dir.as_ref(); let src_dir = output_dir.join("src"); for (_, (path, _)) in doc.resources.clone().into_iter() { let content = match doc.get_resource_by_path(&path) { Some(content) => content, None => continue, // unreachable }; if let Some(md_path) = html_to_md.get(&path) { // html file, convert to md let target_path = src_dir.join(md_path); if let Some(parent) = target_path.parent() { fs::create_dir_all(parent)?; } let html = String::from_utf8(content.clone())?; let markdown = html2md::parse_html(&html); let markdown = post_process_md(&markdown, html_to_md); fs::write(target_path, markdown)?; } else { // other file, just copy let target_path = src_dir.join(&path); if let Some(parent) = target_path.parent() { fs::create_dir_all(parent)?; } fs::write(target_path, content)?; } } Ok(()) } /// Capture the `{link}` without `#`, eg: /// ``` /// [ABC]({abc.html}#xxx) /// [ABC]({abc.html}) /// ``` static LINK_REGEX: LazyLock = LazyLock::new(|| Regex::new(r#"\[[^\]]+\]\(([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable")); /// Capture the empty links, eg: /// ``` /// [{ABC}]() /// ``` static LINK_NAME_REGEX: LazyLock = LazyLock::new(|| Regex::new(r#"\[([^\]]+)\]\(\)"#).expect("unreachable")); fn post_process_md(markdown: &str, html_to_md: &HashMap) -> String { let file_name_map = html_to_md .iter() .filter_map(|(k, v)| Some((k.file_name()?, v.file_name()?))) .collect::>(); let markdown = LINK_REGEX .replace_all(markdown, |caps: &Captures| { // replace [ABC](abc.html#xxx) to [ABC](abc.md#xxx) let origin = &caps[0]; let link = match Path::new(&caps[1]).file_name() { Some(link) => link, None => return origin.to_string(), }; if let Some(md_path) = file_name_map.get(link) { origin.replace( &link.to_string_lossy().to_string(), &md_path.to_string_lossy(), ) } else { origin.to_string() } }) .replace(r"![]()", "") .replace(r"[]()", ""); LINK_NAME_REGEX .replace_all(&markdown, |caps: &Captures| caps[1].to_string()) .to_string() } fn write_book_toml( output_dir: impl AsRef, title: &str, creator: Option, ) -> io::Result<()> { let output_dir = output_dir.as_ref(); let creator = match creator { Some(creator) => format!("author = \"{creator}\"\n"), None => "".to_string(), }; let toml_content = format!("[book]\ntitle = \"{title}\"\n{creator}",); fs::write(output_dir.join("book.toml"), toml_content)?; Ok(()) } #[cfg(test)] mod tests { use super::*; #[test] fn test_replace_links() { let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)"; let markdown = LINK_REGEX.replace_all(&markdown, |caps: &Captures| { let link = caps[1].to_string(); caps[0].replace(&link, "hello.md") }); assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hello.md)"); } }