Maverick Liu 2025-02-21 17:43:54 +08:00
parent 749af017ca
commit 6bcc1d9682
4 changed files with 52 additions and 19 deletions

Cargo.lock (generated)

@@ -226,6 +226,7 @@ dependencies = [
  "clap",
  "epub",
  "html2md",
+ "regex",
 ]

 [[package]]

Cargo.toml

@@ -8,3 +8,4 @@ anyhow = "1.0.96"
 clap = { version = "4.5.30", features = ["derive"] }
 epub = "2.1.1"
 html2md = "0.2.15"
+regex = "1.11.1"

src/lib.rs

@@ -1,5 +1,6 @@
 use epub::doc::{EpubDoc, NavPoint};
 use html2md::parse_html;
+use regex::{Captures, Regex};
 use std::collections::HashMap;
 use std::fs;
 use std::io::{Read, Seek};
@@ -9,28 +10,31 @@ pub fn convert_epub_to_mdbook(
     epub_path: impl AsRef<Path>,
     output_dir: Option<impl AsRef<Path>>,
 ) -> anyhow::Result<()> {
-    let book_name = epub_path.as_ref().with_extension("");
-    let book_name = book_name.file_name().unwrap().to_str().unwrap();
-    let pwd = PathBuf::from(".");
+    let epub_path = epub_path.as_ref();
+    if !epub_path.is_file() {
+        return Err(anyhow::anyhow!("{} is not a file", epub_path.display()));
+    }
+    let book_name = epub_path.with_extension("");
+    let book_name = book_name.file_name().unwrap().to_string_lossy().to_string();
     let output_dir = match output_dir {
-        Some(output_dir) => output_dir.as_ref().join(book_name),
-        None => pwd.join(book_name),
+        Some(output_dir) => output_dir.as_ref().join(&book_name),
+        None => PathBuf::from(".").join(&book_name),
     };
     fs::create_dir_all(output_dir.join("src"))?;
-    let mut doc = EpubDoc::new(&epub_path)?;
+    let mut doc = EpubDoc::new(epub_path)?;
     let title = if let Some(title) = doc.metadata.get("title") {
-        title.first().unwrap_or(&book_name.to_string()).clone()
+        title.first().cloned().unwrap_or(book_name)
     } else {
-        book_name.to_string()
+        book_name
     };
     let creator = doc.metadata.get("creator").and_then(|v| v.first().cloned());
-    let (toc, html_files) = toc_to_md(&doc, &title)?;
+    let (toc, html_to_md) = toc_to_md(&doc, &title)?;
     fs::write(output_dir.join("src/SUMMARY.md"), toc)?;
-    extract_chapters_and_resources(&mut doc, &output_dir, &html_files)?;
+    extract_chapters_and_resources(&mut doc, &output_dir, &html_to_md)?;
     write_book_toml(&output_dir, &title, creator)?;
     Ok(())
 }
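
The rewritten preamble above rejects paths that are not regular files before any work is done, and derives the book name from the file stem via with_extension("") followed by file_name(). A minimal sketch of that derivation in isolation; the helper name book_name_from_path is hypothetical and not part of the crate:

use std::path::Path;

// Hypothetical helper mirroring the book-name derivation in
// convert_epub_to_mdbook: drop the extension, keep the final path component.
fn book_name_from_path(epub_path: &Path) -> Option<String> {
    let stem = epub_path.with_extension("");
    stem.file_name().map(|name| name.to_string_lossy().to_string())
}

fn main() {
    // "books/My Novel.epub" -> "My Novel"
    assert_eq!(
        book_name_from_path(Path::new("books/My Novel.epub")),
        Some("My Novel".to_string())
    );
}
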
@@ -45,7 +49,7 @@ pub fn nav_point_to_md(
         "{}- [{}]({})\n",
         " ".repeat(indent),
         nav.label,
-        file.to_str()?
+        file.to_string_lossy()
     );
     for child in &nav.children {
         if let Some(child_md) = nav_point_to_md(child, indent + 1, html_files) {
@@ -62,41 +66,53 @@ pub fn toc_to_md<R: Read + Seek>(
     let toc = doc.toc.clone();
     let mut markdown = format!("# {}\n\n", title);
-    let html_files = doc
+    let html_to_md = doc
         .resources
         .iter()
         .filter(|(_, (_, mime))| mime == "application/xhtml+xml")
         .map(|(_, (path, _))| (path.clone(), path.with_extension("md")))
         .collect::<HashMap<PathBuf, PathBuf>>();
     for nav in toc {
-        if let Some(md) = nav_point_to_md(&nav, 0, &html_files) {
+        if let Some(md) = nav_point_to_md(&nav, 0, &html_to_md) {
             markdown.push_str(&md);
         }
     }
-    Ok((markdown, html_files))
+    Ok((markdown, html_to_md))
 }

 pub fn extract_chapters_and_resources<R: Read + Seek>(
     doc: &mut EpubDoc<R>,
     output_dir: impl AsRef<Path>,
-    html_files: &HashMap<PathBuf, PathBuf>,
+    html_to_md: &HashMap<PathBuf, PathBuf>,
 ) -> anyhow::Result<()> {
     let output_dir = output_dir.as_ref();
     let src_dir = output_dir.join("src");
+    let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap(); // [abc](abc.html)
     for (_, (path, _)) in doc.resources.clone().into_iter() {
         let content = match doc.get_resource_by_path(&path) {
             Some(content) => content,
             None => continue,
         };
-        if let Some(path) = html_files.get(&path) {
+        if let Some(path) = html_to_md.get(&path) {
             let target_path = src_dir.join(path);
             if let Some(parent) = target_path.parent() {
                 fs::create_dir_all(parent)?;
             }
             let html = String::from_utf8(content)?;
             let markdown = parse_html(&html);
+            let markdown = re
+                .replace_all(&markdown, |caps: &Captures| {
+                    let link = caps[1].to_string();
+                    let ori = caps[0].to_string();
+                    if let Some(md_path) = html_to_md.get(&PathBuf::from(&link)) {
+                        let md_path = md_path.to_string_lossy().to_string();
+                        ori.replace(&link, &md_path)
+                    } else {
+                        ori
+                    }
+                })
+                .to_string();
             fs::write(target_path, markdown)?;
         } else {
             let target_path = src_dir.join(&path);
@@ -123,3 +139,18 @@ pub fn write_book_toml(
     fs::write(output_dir.join("book.toml"), toml_content)?;
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_replace_links() {
+        let markdown = r"[hello](hello.html)";
+        let re = Regex::new(r#"\[[^\]]+\]\(([^)]+)\)"#).unwrap();
+        let markdown = re.replace_all(&markdown, |caps: &Captures| {
+            let link = caps[1].to_string();
+            caps[0].replace(&link, "hello.md")
+        });
+        assert_eq!(markdown, "[hello](hello.md)");
+    }
+}
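
The regex pass added above rewrites the Markdown links that html2md emits: when a link target matches a source XHTML path recorded in html_to_md, the .html path is swapped for its .md counterpart, and everything else (external URLs, unknown files) passes through unchanged. The unit test covers the regex but not the map lookup; the following self-contained sketch includes the lookup, assuming a small hand-built map in place of the one derived from doc.resources:

use regex::{Captures, Regex};
use std::collections::HashMap;
use std::path::PathBuf;

fn main() {
    // Hand-built stand-in for the map normally derived from doc.resources.
    let html_to_md: HashMap<PathBuf, PathBuf> =
        [(PathBuf::from("ch01.html"), PathBuf::from("ch01.md"))]
            .into_iter()
            .collect();

    let re = Regex::new(r"\[[^\]]+\]\(([^)]+)\)").unwrap();
    let markdown = "[Chapter 1](ch01.html) and [external](https://example.com)";

    let rewritten = re.replace_all(markdown, |caps: &Captures| {
        let link = caps[1].to_string();
        match html_to_md.get(&PathBuf::from(&link)) {
            // Known source file: point the link at the converted .md file.
            Some(md) => caps[0].replace(&link, &md.to_string_lossy()),
            // External URLs and unknown targets pass through unchanged.
            None => caps[0].to_string(),
        }
    });

    assert_eq!(
        rewritten,
        "[Chapter 1](ch01.md) and [external](https://example.com)"
    );
}
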

src/main.rs

@@ -7,7 +7,7 @@ use epub2mdbook::convert_epub_to_mdbook;
 struct Args {
     /// The path to the input EPUB file
     #[clap(short, long)]
-    input_epub_path: PathBuf,
+    input_epub: PathBuf,
     /// The path to the output directory
     #[clap(short, long)]
     output_dir: Option<PathBuf>,
@@ -15,7 +15,7 @@ struct Args {
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
-    convert_epub_to_mdbook(args.input_epub_path, args.output_dir)?;
+    convert_epub_to_mdbook(args.input_epub, args.output_dir)?;
     println!("Conversion completed successfully!");
     Ok(())
 }
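
With clap's derive API, renaming the field from input_epub_path to input_epub also renames the flag: the argument should now parse as -i / --input-epub rather than --input-epub-path. A small sketch of that behavior using Parser::parse_from with a made-up argv (the program name "epub2mdbook" here is only a placeholder):

use clap::Parser;
use std::path::PathBuf;

#[derive(Parser)]
struct Args {
    /// The path to the input EPUB file
    #[clap(short, long)]
    input_epub: PathBuf,
    /// The path to the output directory
    #[clap(short, long)]
    output_dir: Option<PathBuf>,
}

fn main() {
    // Simulated command line; the first element is treated as the program name.
    let args = Args::parse_from(["epub2mdbook", "--input-epub", "book.epub"]);
    assert_eq!(args.input_epub, PathBuf::from("book.epub"));
    assert!(args.output_dir.is_none());
}
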