今天開始進入新的主題:利用 Rust 實現資料處理與分析。
我們今天先從比較常見的格式 CSV 著手,處理相關的檔案和資料。
無論是從資料庫匯出、API 回應,還是 Excel 轉存,我們經常需要清洗、轉換和驗證 CSV 資料。
我們將建立一個 CLI 工具,完成一些比較簡單的資料清洗工作:
cargo new csv_cleaner
cd csv_cleaner
[dependencies]
csv = "1.3"
serde = { version = "1.0", features = ["derive"] }
anyhow = "1.0"
clap = { version = "4.5", features = ["derive"] }
src/record.rs
資料結構以及一些處理資料的方法
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// A single CSV row, stored as a map from column name to cell text.
///
/// `Default` is derived so the type can be built with `Record::default()` as
/// well as `Record::new()`; `PartialEq`/`Eq` allow whole records to be compared.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Record {
    /// Cell values keyed by column name. `#[serde(flatten)]` makes serde treat
    /// the map entries as if they were fields of `Record` itself.
    #[serde(flatten)]
    pub fields: HashMap<String, String>,
}
impl Record {
    /// Creates a record with no fields.
    pub fn new() -> Self {
        Self {
            fields: HashMap::new(),
        }
    }

    /// Returns the value stored under `key`, or `None` if the record has no
    /// such column.
    pub fn get(&self, key: &str) -> Option<&String> {
        self.fields.get(key)
    }

    /// Stores `value` under `key`, replacing any value already present.
    pub fn set(&mut self, key: String, value: String) {
        self.fields.insert(key, value);
    }

    /// Removes `key` from the record and returns its value, if it was present.
    pub fn remove(&mut self, key: &str) -> Option<String> {
        self.fields.remove(key)
    }

    /// Moves the value stored under `old_name` to `new_name`.
    ///
    /// Returns `true` if `old_name` existed (and was renamed), `false`
    /// otherwise. A value already stored under `new_name` is replaced.
    pub fn rename_field(&mut self, old_name: &str, new_name: &str) -> bool {
        match self.fields.remove(old_name) {
            Some(value) => {
                self.fields.insert(new_name.to_string(), value);
                true
            }
            None => false,
        }
    }

    /// Strips leading and trailing whitespace from every field value.
    pub fn trim_fields(&mut self) {
        for value in self.fields.values_mut() {
            let trimmed = value.trim();
            // Only reallocate when trimming actually removed something.
            if trimmed.len() != value.len() {
                *value = trimmed.to_string();
            }
        }
    }

    /// Replaces every empty field value with `default_value`.
    ///
    /// A value made up only of whitespace counts as empty, matching the
    /// emptiness test used when rows with empty cells are filtered out, so the
    /// result does not depend on whether `trim_fields` ran first.
    pub fn fill_empty(&mut self, default_value: &str) {
        for value in self.fields.values_mut() {
            if value.trim().is_empty() {
                *value = default_value.to_string();
            }
        }
    }
}
src/csv_handler.rs
CSV 的讀寫以及相關的操作方法
use crate::record::Record;
use anyhow::{Context, Result};
use csv::{Reader, Writer};
use std::fs::File;
use std::path::Path;
/// In-memory representation of one CSV file: the header row plus every data
/// row.
///
/// `Default` yields the same empty handler as `CsvHandler::new()`.
#[derive(Debug, Default)]
pub struct CsvHandler {
    /// Column names, in the order they are written back to the output file.
    headers: Vec<String>,
    /// Data rows; each row maps a column name to its cell text.
    records: Vec<Record>,
}
impl CsvHandler {
pub fn new() -> Self {
Self {
headers: Vec::new(),
records: Vec::new(),
}
}
pub fn read_from_file<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
let file = File::open(path.as_ref())
.context("無法開啟 CSV 檔案")?;
let mut reader = Reader::from_reader(file);
// 讀取標題
self.headers = reader
.headers()
.context("無法讀取 CSV 標題")?
.iter()
.map(|s| s.to_string())
.collect();
// 讀取所有記錄
for result in reader.records() {
let csv_record = result.context("讀取記錄時發生錯誤")?;
let mut record = Record::new();
for (i, field) in csv_record.iter().enumerate() {
if let Some(header) = self.headers.get(i) {
record.set(header.clone(), field.to_string());
}
}
self.records.push(record);
}
println!("成功讀取 {} 筆記錄", self.records.len());
Ok(())
}
pub fn write_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
let file = File::create(path.as_ref())
.context("無法建立輸出檔案")?;
let mut writer = Writer::from_writer(file);
// 寫入標題
writer.write_record(&self.headers)
.context("寫入標題時發生錯誤")?;
// 寫入所有記錄
for record in &self.records {
let row: Vec<String> = self.headers
.iter()
.map(|h| record.get(h).cloned().unwrap_or_default())
.collect();
writer.write_record(&row)
.context("寫入記錄時發生錯誤")?;
}
writer.flush().context("寫入檔案時發生錯誤")?;
println!("成功寫入 {} 筆記錄到檔案", self.records.len());
Ok(())
}
pub fn records(&self) -> &[Record] {
&self.records
}
pub fn records_mut(&mut self) -> &mut [Record] {
&mut self.records
}
pub fn headers(&self) -> &[String] {
&self.headers
}
pub fn rename_column(&mut self, old_name: &str, new_name: &str) {
// 更新標題
if let Some(pos) = self.headers.iter().position(|h| h == old_name) {
self.headers[pos] = new_name.to_string();
}
// 更新所有記錄
for record in &mut self.records {
record.rename_field(old_name, new_name);
}
}
pub fn remove_column(&mut self, column_name: &str) {
// 從標題中移除
self.headers.retain(|h| h != column_name);
// 從所有記錄中移除
for record in &mut self.records {
record.remove(column_name);
}
}
}
src/cleaner.rs
實作資料清洗功能
use crate::csv_handler::CsvHandler;
use anyhow::Result;
/// Stateless namespace type; the cleaning operations are associated functions
/// that take the `CsvHandler` to work on as an argument.
pub struct Cleaner;
impl Cleaner {
    /// Trims leading and trailing whitespace from every field of every record.
    pub fn trim_all(handler: &mut CsvHandler) {
        for record in handler.records_mut().iter_mut() {
            record.trim_fields();
        }
        println!("已清理所有欄位的前後空白");
    }

    /// Replaces empty field values with `default_value` in every record.
    pub fn fill_empty_values(handler: &mut CsvHandler, default_value: &str) {
        for record in handler.records_mut().iter_mut() {
            record.fill_empty(default_value);
        }
        println!("已將空值填充為: {}", default_value);
    }

    /// Removes every record that has an empty value in any of `columns`.
    ///
    /// A value is empty when it is blank after trimming; a record that lacks
    /// one of the columns altogether is removed as well. With no columns
    /// given, nothing is removed.
    pub fn remove_rows_with_empty(handler: &mut CsvHandler, columns: &[String]) {
        let original_count = handler.records().len();
        handler.records_mut().retain(|record| {
            columns.iter().all(|col| {
                record
                    .get(col)
                    .map(|v| !v.trim().is_empty())
                    .unwrap_or(false)
            })
        });
        let removed = original_count - handler.records().len();
        println!("移除了 {} 筆包含空值的記錄", removed);
    }

    /// Keeps only the records for which `predicate` returns `true`.
    pub fn filter_records<F>(handler: &mut CsvHandler, predicate: F)
    where
        F: Fn(&crate::record::Record) -> bool,
    {
        let original_count = handler.records().len();
        handler.records_mut().retain(predicate);
        let removed = original_count - handler.records().len();
        println!("過濾後移除了 {} 筆記錄", removed);
    }

    /// Normalises the values of `column` that parse as numbers.
    ///
    /// Each value is trimmed and parsed as `f64`; on success it is replaced by
    /// the number's canonical text form. Values that do not parse are left
    /// unchanged. The function currently always returns `Ok(())`.
    pub fn convert_to_number(handler: &mut CsvHandler, column: &str) -> Result<()> {
        for record in handler.records_mut().iter_mut() {
            // Finish reading the old value before writing the new one.
            let parsed = record
                .get(column)
                .and_then(|value| value.trim().parse::<f64>().ok());
            if let Some(num) = parsed {
                record.set(column.to_string(), num.to_string());
            }
        }
        println!("已轉換 '{}' 欄位為數字格式", column);
        Ok(())
    }

    /// Removes records whose `key_columns` values repeat an earlier record,
    /// keeping the first occurrence.
    ///
    /// * If `key_columns` is empty, all header columns are used, so only rows
    ///   that are identical in every column count as duplicates.
    /// * A record that has none of the key columns is always kept: there is
    ///   nothing to compare, and treating such rows as equal to each other
    ///   would delete all but one of them (for example after a typo in a
    ///   column name).
    pub fn remove_duplicates(handler: &mut CsvHandler, key_columns: &[String]) {
        use std::collections::HashSet;

        // Copy the headers only when they are needed as the fallback key; the
        // copy is required because `records_mut` borrows the handler mutably.
        let all_columns;
        let key_columns: &[String] = if key_columns.is_empty() {
            all_columns = handler.headers().to_vec();
            &all_columns
        } else {
            key_columns
        };

        let original_count = handler.records().len();
        let mut seen: HashSet<Vec<Option<String>>> = HashSet::new();
        handler.records_mut().retain(|record| {
            // Keep missing cells as `None` so each value stays in its column's
            // position instead of sliding into a neighbour's slot.
            let key: Vec<Option<String>> = key_columns
                .iter()
                .map(|col| record.get(col).cloned())
                .collect();
            if key.iter().all(Option::is_none) {
                return true;
            }
            seen.insert(key)
        });
        let removed = original_count - handler.records().len();
        println!("移除了 {} 筆重複記錄", removed);
    }
}
mod record;
mod csv_handler;
mod cleaner;
use anyhow::Result;
use clap::{Parser, Subcommand};
use csv_handler::CsvHandler;
use cleaner::Cleaner;
// Top-level command-line definition parsed by clap.
// Plain `//` comments are used on purpose: clap turns `///` doc comments into
// help text, which would change the program's output.
#[derive(Parser)]
#[command(name = "csv_cleaner", about = "CSV 資料清洗工具", long_about = None)]
struct Cli {
    // The selected subcommand together with its arguments.
    #[command(subcommand)]
    command: Commands,
}
// Subcommands of the tool. The `///` doc comments below are shown to the user
// as clap help text, so they are kept exactly as written; explanatory notes
// use plain `//` comments.
#[derive(Subcommand)]
enum Commands {
    /// 清理 CSV 檔案(移除空白、填充空值)
    Clean {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
        /// 輸出檔案路徑
        #[arg(short, long)]
        output: String,
        // A plain `bool` field is a set-to-true flag in clap, so combined with
        // a default of "true" it could never be switched off. Taking an
        // optional value keeps `--trim` (and omitting it) meaning "true" while
        // allowing `--trim false` / `--trim=false` to disable trimming.
        /// 是否移除前後空白
        #[arg(
            long,
            action = clap::ArgAction::Set,
            num_args = 0..=1,
            default_value_t = true,
            default_missing_value = "true"
        )]
        trim: bool,
        /// 填充空值的預設值
        #[arg(long)]
        fill_empty: Option<String>,
    },
    /// 重命名欄位
    Rename {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
        /// 輸出檔案路徑
        #[arg(short, long)]
        output: String,
        /// 舊欄位名稱
        #[arg(long)]
        from: String,
        /// 新欄位名稱
        #[arg(long)]
        to: String,
    },
    /// 移除欄位
    Remove {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
        /// 輸出檔案路徑
        #[arg(short, long)]
        output: String,
        // Repeat the option to remove several columns.
        /// 要移除的欄位名稱
        #[arg(long)]
        columns: Vec<String>,
    },
    /// 過濾記錄(移除包含空值的行)
    Filter {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
        /// 輸出檔案路徑
        #[arg(short, long)]
        output: String,
        // Repeat the option to check several columns.
        /// 檢查空值的欄位
        #[arg(long)]
        check_empty: Vec<String>,
    },
    /// 移除重複記錄
    Dedupe {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
        /// 輸出檔案路徑
        #[arg(short, long)]
        output: String,
        // Repeat the option to build a key from several columns.
        /// 用於判斷重複的鍵欄位
        #[arg(long)]
        keys: Vec<String>,
    },
    /// 顯示 CSV 檔案資訊
    Info {
        /// 輸入檔案路徑
        #[arg(short, long)]
        input: String,
    },
}
fn main() -> Result<()> {
let cli = Cli::parse();
match cli.command {
Commands::Clean { input, output, trim, fill_empty } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
if trim {
Cleaner::trim_all(&mut handler);
}
if let Some(default_value) = fill_empty {
Cleaner::fill_empty_values(&mut handler, &default_value);
}
handler.write_to_file(&output)?;
}
Commands::Rename { input, output, from, to } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
handler.rename_column(&from, &to);
handler.write_to_file(&output)?;
}
Commands::Remove { input, output, columns } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
for column in columns {
handler.remove_column(&column);
}
handler.write_to_file(&output)?;
}
Commands::Filter { input, output, check_empty } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
Cleaner::remove_rows_with_empty(&mut handler, &check_empty);
handler.write_to_file(&output)?;
}
Commands::Dedupe { input, output, keys } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
Cleaner::remove_duplicates(&mut handler, &keys);
handler.write_to_file(&output)?;
}
Commands::Info { input } => {
let mut handler = CsvHandler::new();
handler.read_from_file(&input)?;
println!("\n=== CSV 檔案資訊 ===");
println!("欄位數量: {}", handler.headers().len());
println!("記錄數量: {}", handler.records().len());
println!("\n欄位名稱:");
for (i, header) in handler.headers().iter().enumerate() {
println!(" {}. {}", i + 1, header);
}
}
}
Ok(())
}
這裡我們建立一個 sample.csv 作為讀取用的測試資料:
姓名,年齡,城市,薪資
張三, 28,台北,50000
李四,32, ,60000
王五,25,台中,
,30,高雄,55000
張三,28,台北,50000
這裡我們故意製作一些有缺漏、多餘空白以及重複的資料
# 清理空白並填充空值
cargo run -- clean -i sample.csv -o cleaned.csv --fill-empty "N/A"
重新命名欄位
# 將「姓名」改為「Name」
cargo run -- rename -i sample.csv -o renamed.csv --from 姓名 --to Name
移除欄位
# 移除「薪資」欄位
cargo run -- remove -i sample.csv -o removed.csv --columns 薪資
過濾記錄
# 移除姓名或城市為空的記錄
cargo run -- filter -i sample.csv -o filtered.csv --check-empty 姓名 --check-empty 城市
移除重複
# 根據姓名和年齡移除重複記錄
# (sample.csv 第一筆的年齡是「 28」,帶有前置空白,與最後一筆的「28」不相等,
#  因此請以清理過的 cleaned.csv 作為輸入,重複的記錄才會被移除)
cargo run -- dedupe -i cleaned.csv -o deduped.csv --keys 姓名 --keys 年齡
查看資訊
cargo run -- info -i sample.csv