今天是關於資料處理與分析類別最後一篇文章,寫文件轉換器,
我個人最常用的就是 Markdown 格式,舉凡我的筆記到專案相關報告,我通常都會先從 Markdown 格式開始著手
現在我想做一個文件轉換器,也就是把 Markdown 轉成 HTML 或 PDF 的工具,我認為這能解決我的痛點,
既然是我的痛點,所以我希望藉由這次的專案學習用 Rust 實現。
[dependencies]
pulldown-cmark = "0.9" # Markdown 解析器
syntect = "5.0" # 語法高亮
headless_chrome = "1.0" # PDF 生成
clap = { version = "4.4", features = ["derive"] }
anyhow = "1.0"
src/converter.rs
use pulldown_cmark::{Parser, Options, html, Event, Tag, CodeBlockKind};
use syntect::parsing::SyntaxSet;
use syntect::highlighting::{ThemeSet, Style};
use syntect::html::highlighted_html_for_string;
use anyhow::{Result, Context};
use std::fs;
use std::path::Path;
/// Markdown-to-HTML converter that owns the syntect data used for code highlighting.
pub struct MarkdownConverter {
/// Syntax definitions searched when resolving a code block's language.
syntax_set: SyntaxSet,
/// Highlighting themes; the theme used for output is looked up here by name.
theme_set: ThemeSet,
}
impl MarkdownConverter {
    /// Name of the bundled syntect theme applied to highlighted code blocks.
    const THEME_NAME: &'static str = "base16-ocean.dark";

    /// Creates a converter backed by syntect's bundled syntax and theme sets.
    pub fn new() -> Self {
        Self {
            syntax_set: SyntaxSet::load_defaults_newlines(),
            theme_set: ThemeSet::load_defaults(),
        }
    }

    /// Renders `markdown` to an HTML fragment (no `<html>`/`<body>` wrapper).
    ///
    /// Tables, footnotes, strikethrough and task lists are enabled. Fenced
    /// code blocks are replaced by syntax-highlighted HTML.
    ///
    /// # Errors
    ///
    /// The current implementation has no failing path; the `Result` return
    /// type is kept so callers do not have to change.
    pub fn convert(&self, markdown: &str) -> Result<String> {
        let options = Options::ENABLE_TABLES
            | Options::ENABLE_FOOTNOTES
            | Options::ENABLE_STRIKETHROUGH
            | Options::ENABLE_TASKLISTS;
        let parser = Parser::new_ext(markdown, options);

        // Swap fenced code blocks for highlighted HTML before rendering.
        let events = self.process_code_blocks(parser);

        let mut html_output = String::new();
        html::push_html(&mut html_output, events.into_iter());
        Ok(html_output)
    }

    /// Collects the parser's events, replacing every fenced code block
    /// (start tag, text, end tag) with a single pre-rendered `Event::Html`.
    ///
    /// All other events, including the start and end of indented code
    /// blocks, are passed through unchanged so the HTML renderer still emits
    /// balanced tags for them.
    ///
    /// Accepts any event iterator rather than a concrete `Parser` type so the
    /// helper does not depend on the parser's generic/lifetime parameters.
    fn process_code_blocks<'a>(
        &self,
        parser: impl Iterator<Item = Event<'a>>,
    ) -> Vec<Event<'a>> {
        let mut events = Vec::new();
        // `Some(info_string)` while inside a fenced code block, `None` otherwise.
        let mut fenced_lang: Option<String> = None;
        let mut code_content = String::new();

        for event in parser {
            match event {
                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(lang))) => {
                    fenced_lang = Some(lang.to_string());
                    code_content.clear();
                }
                // Only consume the end tag of a block whose start tag was consumed;
                // otherwise the end of an indented block would be dropped.
                Event::End(Tag::CodeBlock(_)) if fenced_lang.is_some() => {
                    let lang = fenced_lang.take().unwrap_or_default();
                    let highlighted = self.highlight_code(&code_content, &lang);
                    events.push(Event::Html(highlighted.into()));
                }
                Event::Text(text) if fenced_lang.is_some() => {
                    code_content.push_str(&text);
                }
                other => events.push(other),
            }
        }
        events
    }

    /// Returns highlighted HTML for `code`.
    ///
    /// `lang` is the fence info string; only its first token (up to a comma
    /// or whitespace) is used to pick a syntax, so `rust,ignore` resolves to
    /// Rust. Unknown languages are highlighted as plain text. If the theme is
    /// missing or highlighting fails, the code is emitted HTML-escaped inside
    /// a plain `<pre><code>` element.
    fn highlight_code(&self, code: &str, lang: &str) -> String {
        let token = lang
            .split(|c: char| c == ',' || c.is_whitespace())
            .next()
            .unwrap_or("");
        let syntax = self
            .syntax_set
            .find_syntax_by_token(token)
            .unwrap_or_else(|| self.syntax_set.find_syntax_plain_text());

        self.theme_set
            .themes
            .get(Self::THEME_NAME)
            .and_then(|theme| {
                highlighted_html_for_string(code, &self.syntax_set, syntax, theme).ok()
            })
            .unwrap_or_else(|| {
                // The fallback bypasses the highlighter, so escape the code here.
                format!("<pre><code>{}</code></pre>", Self::escape_html(code))
            })
    }

    /// Escapes the characters that are significant in HTML text content.
    fn escape_html(text: &str) -> String {
        let mut escaped = String::with_capacity(text.len());
        for ch in text.chars() {
            match ch {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                _ => escaped.push(ch),
            }
        }
        escaped
    }
}

impl Default for MarkdownConverter {
    /// Equivalent to [`MarkdownConverter::new`].
    fn default() -> Self {
        Self::new()
    }
}
pub fn convert_to_html(
input: &Path,
output: &Path,
css: Option<&Path>,
) -> Result<()> {
let markdown = fs::read_to_string(input)
.context("無法讀取 Markdown 文件")?;
let converter = MarkdownConverter::new();
let html_content = converter.convert(&markdown)?;
let css_content = if let Some(css_path) = css {
fs::read_to_string(css_path)
.context("無法讀取 CSS 文件")?
} else {
crate::template::DEFAULT_CSS.to_string()
};
let full_html = crate::template::create_html_template(
&html_content,
&css_content,
input.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("文件")
);
fs::write(output, full_html)
.context("無法寫入 HTML 文件")?;
Ok(())
}
pub fn convert_to_pdf(
input: &Path,
output: &Path,
css: Option<&Path>,
) -> Result<()> {
// 先轉換為 HTML
let temp_html = output.with_extension("temp.html");
convert_to_html(input, &temp_html, css)?;
// 使用 headless Chrome 轉換為 PDF
crate::pdf::html_to_pdf(&temp_html, output)?;
// 清理臨時文件
fs::remove_file(temp_html)?;
Ok(())
}
pub fn batch_convert(
input_dir: &Path,
output_dir: &Path,
format: &str,
) -> Result<()> {
fs::create_dir_all(output_dir)?;
for entry in fs::read_dir(input_dir)? {
let entry = entry?;
let path = entry.path();
if path.extension().and_then(|s| s.to_str()) == Some("md") {
let output_name = path.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("output");
let output_path = output_dir.join(format!(
"{}.{}",
output_name,
format
));
match format {
"html" => convert_to_html(&path, &output_path, None)?,
"pdf" => convert_to_pdf(&path, &output_path, None)?,
_ => anyhow::bail!("不支援的格式: {}", format),
}
println!(" ✓ {}", path.file_name().unwrap().to_string_lossy());
}
}
Ok(())
}
src/template.rs
這裡的樣式我先請 AI 產生一版
// src/template.rs
/// Built-in stylesheet embedded into generated pages when the caller supplies
/// no custom CSS file. Covers body typography, headings, inline and block
/// code, blockquotes, tables, images and links.
pub const DEFAULT_CSS: &str = r#"
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 0 auto;
padding: 2rem;
color: #333;
}
h1, h2, h3, h4, h5, h6 {
margin-top: 1.5em;
margin-bottom: 0.5em;
font-weight: 600;
}
h1 { font-size: 2em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
h2 { font-size: 1.5em; border-bottom: 1px solid #eaecef; padding-bottom: 0.3em; }
h3 { font-size: 1.25em; }
code {
background-color: #f6f8fa;
padding: 0.2em 0.4em;
border-radius: 3px;
font-family: 'SFMono-Regular', Consolas, 'Liberation Mono', Menlo, monospace;
font-size: 85%;
}
pre {
background-color: #f6f8fa;
padding: 16px;
overflow: auto;
border-radius: 6px;
line-height: 1.45;
}
pre code {
background-color: transparent;
padding: 0;
}
blockquote {
border-left: 4px solid #dfe2e5;
padding-left: 1em;
margin-left: 0;
color: #6a737d;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
th, td {
border: 1px solid #dfe2e5;
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f6f8fa;
font-weight: 600;
}
img {
max-width: 100%;
height: auto;
}
a {
color: #0366d6;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
"#;
/// Wraps an HTML fragment in a complete HTML5 document.
///
/// * `content` - already-rendered HTML placed verbatim inside `<body>`.
/// * `css` - stylesheet text placed verbatim inside a `<style>` element.
/// * `title` - plain text for the `<title>` element; it is HTML-escaped here
///   because it typically comes from a file name and may contain `<` or `&`.
///
/// Returns the full document as a string.
pub fn create_html_template(content: &str, css: &str, title: &str) -> String {
    format!(
        r#"<!DOCTYPE html>
<html lang="zh-TW">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{}</title>
<style>
{}
</style>
</head>
<body>
{}
</body>
</html>"#,
        escape_title_text(title),
        css,
        content
    )
}

/// Escapes the characters that are significant in HTML text content.
fn escape_title_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            _ => escaped.push(ch),
        }
    }
    escaped
}
src/pdf.rs
// src/pdf.rs
use headless_chrome::{Browser, protocol::cdp::Page};
use anyhow::{Result, Context};
use std::path::Path;
pub fn html_to_pdf(html_path: &Path, output_path: &Path) -> Result<()> {
let browser = Browser::default()
.context("無法啟動 Chrome")?;
let tab = browser.new_tab()
.context("無法建立新分頁")?;
// 載入 HTML 文件
let html_url = format!("file://{}", html_path.canonicalize()?.display());
tab.navigate_to(&html_url)
.context("無法載入 HTML")?;
tab.wait_until_navigated()
.context("等待頁面載入超時")?;
// 轉換為 PDF
let pdf_options = Page::PrintToPdfOptions {
landscape: None,
display_header_footer: Some(false),
print_background: Some(true),
scale: Some(1.0),
paper_width: Some(8.27), // A4 寬度(英吋)
paper_height: Some(11.69), // A4 高度(英吋)
margin_top: Some(0.4),
margin_bottom: Some(0.4),
margin_left: Some(0.4),
margin_right: Some(0.4),
page_ranges: None,
ignore_invalid_page_ranges: None,
header_template: None,
footer_template: None,
prefer_css_page_size: None,
transfer_mode: None,
};
let pdf_data = tab.print_to_pdf(Some(pdf_options))
.context("PDF 轉換失敗")?;
std::fs::write(output_path, pdf_data)
.context("無法寫入 PDF 文件")?;
Ok(())
}
// src/main.rs
mod converter;
mod template;
mod pdf;
use clap::{Parser, Subcommand};
use anyhow::Result;
use std::path::PathBuf;
// Top-level command-line definition for the `mdconv` tool; the argument
// parser is derived from this struct. Plain `//` comments are used here so
// the generated help output is not affected.
#[derive(Parser)]
#[command(name = "mdconv")]
#[command(about = "Markdown 文件轉換工具", long_about = None)]
struct Cli {
// The subcommand selected on the command line.
#[command(subcommand)]
command: Commands,
}
// Subcommands of the tool: single-file HTML conversion, single-file PDF
// conversion, and batch conversion of a directory.
//
// The `///` doc comments below are intentionally left verbatim and
// untranslated: clap's derive macro uses them as the user-facing help text,
// so changing them would change the program's output.
#[derive(Subcommand)]
enum Commands {
/// 轉換為 HTML
Html {
/// 輸入的 Markdown 文件
#[arg(short, long)]
input: PathBuf,
/// 輸出的 HTML 文件
#[arg(short, long)]
output: PathBuf,
/// 自訂 CSS 檔案
#[arg(short, long)]
css: Option<PathBuf>,
},
/// 轉換為 PDF
Pdf {
/// 輸入的 Markdown 文件
#[arg(short, long)]
input: PathBuf,
/// 輸出的 PDF 文件
#[arg(short, long)]
output: PathBuf,
/// 自訂 CSS 檔案
#[arg(short, long)]
css: Option<PathBuf>,
},
/// 批次轉換
Batch {
/// 輸入目錄
#[arg(short, long)]
input_dir: PathBuf,
/// 輸出目錄
#[arg(short, long)]
output_dir: PathBuf,
/// 輸出格式 (html/pdf)
// Defaults to "html" when the flag is omitted.
#[arg(short, long, default_value = "html")]
format: String,
},
}
fn main() -> Result<()> {
let cli = Cli::parse();
match cli.command {
Commands::Html { input, output, css } => {
converter::convert_to_html(&input, &output, css.as_deref())?;
println!("✓ 成功轉換為 HTML: {}", output.display());
}
Commands::Pdf { input, output, css } => {
converter::convert_to_pdf(&input, &output, css.as_deref())?;
println!("✓ 成功轉換為 PDF: {}", output.display());
}
Commands::Batch { input_dir, output_dir, format } => {
converter::batch_convert(&input_dir, &output_dir, &format)?;
println!("✓ 批次轉換完成");
}
}
Ok(())
}
如果我們要轉單一文件的話
# 轉換為 HTML
cargo run -- html -i README.md -o output.html
# 使用自訂 CSS
cargo run -- html -i README.md -o output.html -c custom.css
# 轉換為 PDF
cargo run -- pdf -i README.md -o document.pdf
這裏我們也支援批次轉換
# 批次轉換為 HTML
cargo run -- batch -i ./docs -o ./output -f html
# 批次轉換為 PDF
cargo run -- batch -i ./docs -o ./output -f pdf