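Fix the Markdown link regex (named capture group `link`), rename the EPUB helper functions, and bump the version to 0.9.0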

This commit is contained in:
Maverick Liu 2025-02-23 15:12:47 +08:00
parent a84969da20
commit 24a373c45a
4 changed files with 29 additions and 28 deletions

2
Cargo.lock generated
View file

@@ -192,7 +192,7 @@ dependencies = [
[[package]] [[package]]
name = "epub2mdbook" name = "epub2mdbook"
version = "0.8.0" version = "0.9.0"
dependencies = [ dependencies = [
"clap", "clap",
"epub", "epub",

View file

@@ -1,6 +1,6 @@
[package] [package]
name = "epub2mdbook" name = "epub2mdbook"
version = "0.8.0" version = "0.9.0"
edition = "2024" edition = "2024"
description = "A tool to convert EPUB files to MDBook format" description = "A tool to convert EPUB files to MDBook format"
authors = ["Maverick Liu <maverick.liu42@gmail.com>"] authors = ["Maverick Liu <maverick.liu42@gmail.com>"]

View file

@@ -15,7 +15,7 @@ use std::{fs, io};
/// # Arguments /// # Arguments
/// ///
/// * `epub_path` - The path to the EPUB file /// * `epub_path` - The path to the EPUB file
/// * `output_dir` - The path to the output directory, pwd by default /// * `output_dir` - The path to the output directory, working directory by default
/// ///
pub fn convert_epub_to_mdbook( pub fn convert_epub_to_mdbook(
epub_path: impl AsRef<Path>, epub_path: impl AsRef<Path>,
@@ -37,21 +37,24 @@ pub fn convert_epub_to_mdbook(
}; };
fs::create_dir_all(output_dir.join("src"))?; fs::create_dir_all(output_dir.join("src"))?;
let mut doc = EpubDoc::new(epub_path)?; let mut epub_doc = EpubDoc::new(epub_path)?;
let title = doc let title = epub_doc
.metadata .metadata
.get("title") .get("title")
.and_then(|v| v.first().cloned()) .and_then(|v| v.first().cloned())
.unwrap_or(book_name); .unwrap_or(book_name);
let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned()); let creator = epub_doc
let (toc, html_to_md) = toc_to_md(&doc, &title); .metadata
extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?; .get("creator")
fs::write(output_dir.join("src/SUMMARY.md"), toc)?; .and_then(|v| v.first().cloned());
let (summary_md, html_to_md) = generate_summary_md(&epub_doc, &title);
extract_chapters_and_resources(&mut epub_doc, &output_dir, &html_to_md)?;
fs::write(output_dir.join("src/SUMMARY.md"), summary_md)?;
write_book_toml(&output_dir, &title, creator)?; write_book_toml(&output_dir, &title, creator)?;
Ok(()) Ok(())
} }
fn nav_to_md( fn epub_nav_to_md(
nav: &NavPoint, nav: &NavPoint,
indent: usize, indent: usize,
html_to_md: &HashMap<PathBuf, PathBuf>, html_to_md: &HashMap<PathBuf, PathBuf>,
@@ -64,14 +67,14 @@ fn nav_to_md(
file.to_string_lossy() file.to_string_lossy()
); );
for child in &nav.children { for child in &nav.children {
if let Some(child_md) = nav_to_md(child, indent + 1, html_to_md) { if let Some(child_md) = epub_nav_to_md(child, indent + 1, html_to_md) {
md.push_str(&child_md); md.push_str(&child_md);
} }
} }
Some(md) Some(md)
} }
/// Convert the table of contents to SUMMARY.md /// Generate SUMMARY.md and the file mapping from html to md
/// ///
/// # Arguments /// # Arguments
/// ///
@@ -82,21 +85,19 @@ fn nav_to_md(
/// ///
/// * `summary_md` - The SUMMARY.md content /// * `summary_md` - The SUMMARY.md content
/// * `html_to_md` - The file mapping from html to md /// * `html_to_md` - The file mapping from html to md
pub fn toc_to_md<R: Read + Seek>( pub fn generate_summary_md<R: Read + Seek>(
doc: &EpubDoc<R>, epub_doc: &EpubDoc<R>,
title: &str, title: &str,
) -> (String, HashMap<PathBuf, PathBuf>) { ) -> (String, HashMap<PathBuf, PathBuf>) {
let toc = doc.toc.clone();
let mut summary_md = format!("# {}\n\n", title); let mut summary_md = format!("# {}\n\n", title);
let html_to_md = doc let html_to_md = epub_doc
.resources .resources
.iter() .iter()
.filter(|(_, (_, mime))| mime == "application/xhtml+xml") .filter(|(_, (_, mime))| ["application/xhtml+xml", "text/html"].contains(&&**mime))
.map(|(_, (path, _))| (path.clone(), path.with_extension("md"))) .map(|(_, (path, _))| (path.clone(), path.with_extension("md")))
.collect::<HashMap<PathBuf, PathBuf>>(); .collect::<HashMap<PathBuf, PathBuf>>();
for nav in toc { for nav in &epub_doc.toc {
if let Some(md) = nav_to_md(&nav, 0, &html_to_md) { if let Some(md) = epub_nav_to_md(nav, 0, &html_to_md) {
summary_md.push_str(&md); summary_md.push_str(&md);
} }
} }
@@ -104,7 +105,7 @@ pub fn toc_to_md<R: Read + Seek>(
} }
fn extract_chapters_and_resources<R: Read + Seek>( fn extract_chapters_and_resources<R: Read + Seek>(
doc: &mut EpubDoc<R>, epub_doc: &mut EpubDoc<R>,
output_dir: impl AsRef<Path>, output_dir: impl AsRef<Path>,
html_to_md: &HashMap<PathBuf, PathBuf>, html_to_md: &HashMap<PathBuf, PathBuf>,
) -> Result<(), Error> { ) -> Result<(), Error> {
@@ -114,8 +115,8 @@ fn extract_chapters_and_resources<R: Read + Seek>(
.collect::<HashMap<_, _>>(); .collect::<HashMap<_, _>>();
let output_dir = output_dir.as_ref(); let output_dir = output_dir.as_ref();
let src_dir = output_dir.join("src"); let src_dir = output_dir.join("src");
for (_, (path, _)) in doc.resources.clone().into_iter() { for (_, (path, _)) in epub_doc.resources.clone().into_iter() {
let mut content = match doc.get_resource_by_path(&path) { let mut content = match epub_doc.get_resource_by_path(&path) {
Some(content) => content, Some(content) => content,
None => continue, // unreachable None => continue, // unreachable
}; };
@@ -148,7 +149,7 @@ fn extract_chapters_and_resources<R: Read + Seek>(
/// [ABC]({abc.html}) /// [ABC]({abc.html})
/// ``` /// ```
static LINK: LazyLock<Regex> = LazyLock::new(|| { static LINK: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"\[[^\]]+\]\(?P<link>([^#)]+)(?:#[^)]+)?\)"#).expect("unreachable") Regex::new(r#"\[[^\]]+\]\((?P<link>[^#)]+)(#[^)]+)?\)"#).expect("unreachable")
}); });
/// Match the URL link, eg: /// Match the URL link, eg:
/// ``` /// ```
@@ -204,9 +205,9 @@ mod tests {
fn test_replace_links() { fn test_replace_links() {
let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)"; let markdown = r"[hello](hello.html#xxx) [hi](hi.xhtml)";
let markdown = LINK.replace_all(&markdown, |caps: &Captures| { let markdown = LINK.replace_all(&markdown, |caps: &Captures| {
let link = caps[1].to_string(); let link = &caps["link"];
caps[0].replace(&link, "hello.md") caps[0].replace(link, "link.md")
}); });
assert_eq!(markdown, "[hello](hello.md#xxx) [hi](hello.md)"); assert_eq!(markdown, "[hello](link.md#xxx) [hi](link.md)");
} }
} }

View file

@@ -8,7 +8,7 @@ struct Args {
/// The path to the input EPUB file /// The path to the input EPUB file
#[clap(short, long)] #[clap(short, long)]
input_epub: PathBuf, input_epub: PathBuf,
/// The path to the output directory, pwd by default /// The path to the output directory, working directory by default
#[clap(short, long)] #[clap(short, long)]
output_dir: Option<PathBuf>, output_dir: Option<PathBuf>,
} }