RSS (Really Simple Syndication),
通常用於內容訂閱,是 blog 等相關系統常用的一種格式,基本上就是一個 XML 檔案
現在仍潛藏在各種網站中,雖然這年頭越來越少人會做這樣的東西,不過我們今天把它實現
這裡我們學習如何抓取和解析 RSS Feed
cargo new rss_reader
cd rss_reader
[dependencies]
reqwest = { version = "0.11", features = ["json"] }
tokio = { version = "1.0", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
chrono = { version = "0.4", features = ["serde"] }
quick-xml = "0.31"
clap = { version = "4.0", features = ["derive"] }
anyhow = "1.0"
url = "2.4"
regex = "1.10"
src/models.rs
一樣我們先建立我們的專案結構
use chrono::{DateTime, FixedOffset};
use serde::{Deserialize, Serialize};
/// A parsed feed: channel-level metadata plus the entries collected for it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RssFeed {
    /// Channel title.
    pub title: String,
    /// Channel description.
    pub description: String,
    /// Channel link.
    pub link: String,
    /// Entries of the feed, in the order they were added.
    pub items: Vec<RssItem>,
}
/// A single feed entry. Only `title` is mandatory; every other field is
/// `None` when it has not been set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RssItem {
    /// Entry title.
    pub title: String,
    /// Entry description, if any.
    pub description: Option<String>,
    /// Entry link, if any.
    pub link: Option<String>,
    /// Publication date with its UTC offset, if any.
    pub pub_date: Option<DateTime<FixedOffset>>,
    /// Entry identifier, if any.
    pub guid: Option<String>,
}
/// A named feed location: a display name and the URL it is stored under.
#[derive(Debug, Clone)]
pub struct FeedSource {
    /// Human-readable label for the source.
    pub name: String,
    /// Address of the feed.
    pub url: String,
}
impl RssFeed {
    /// Builds a feed from its channel metadata, starting with no items.
    pub fn new(title: String, description: String, link: String) -> Self {
        let items = Vec::new();
        Self {
            items,
            link,
            description,
            title,
        }
    }
}
src/parser.rs
use crate::models::{RssFeed, RssItem};
use anyhow::{Context, Result};
use chrono::{DateTime, FixedOffset};
use quick_xml::events::Event;
use quick_xml::Reader;
/// Stateless RSS parser built on `quick_xml`'s event reader.
pub struct RssParser;

impl RssParser {
    /// Parses an RSS document into an [`RssFeed`].
    ///
    /// The feed is created as soon as a `<channel>` element opens, so channel
    /// fields are accepted in any order. A field is only recorded when it is a
    /// *direct* child of `<channel>` or `<item>`; children of nested
    /// containers such as `<image>` no longer overwrite the channel metadata.
    /// Text and CDATA chunks belonging to one field are concatenated and
    /// trimmed before being stored.
    ///
    /// # Errors
    ///
    /// Returns an error when the XML is malformed, an element name or CDATA
    /// section is not valid UTF-8, text cannot be unescaped, or the document
    /// contains no `<channel>` element.
    pub fn parse(xml_content: &str) -> Result<RssFeed> {
        let mut reader = Reader::from_str(xml_content);
        let mut buf = Vec::new();
        // Explicit types: both start as `None`, and `feed` is field-accessed
        // before any other use could pin its type down.
        let mut feed: Option<RssFeed> = None;
        let mut current_item: Option<RssItem> = None;
        // Names of the elements that are currently open, outermost first.
        let mut open_elements: Vec<String> = Vec::new();
        // Text collected for the field that is currently being read.
        let mut current_text = String::new();

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    // Bind the name first so the bytes outlive this statement.
                    let qname = e.name();
                    let name = std::str::from_utf8(qname.as_ref())
                        .context("Invalid UTF-8 in element name")?;
                    match name {
                        "channel" => {
                            if feed.is_none() {
                                feed = Some(RssFeed::new(
                                    String::new(),
                                    String::new(),
                                    String::new(),
                                ));
                            }
                        }
                        "item" => {
                            current_item = Some(RssItem {
                                title: String::new(),
                                description: None,
                                link: None,
                                pub_date: None,
                                guid: None,
                            });
                        }
                        _ => {}
                    }
                    // A new direct child of channel/item starts a new field:
                    // discard whitespace left over from between elements.
                    if Self::is_container(open_elements.last()) {
                        current_text.clear();
                    }
                    open_elements.push(name.to_string());
                }
                Ok(Event::End(ref e)) => {
                    let qname = e.name();
                    let name = std::str::from_utf8(qname.as_ref())
                        .context("Invalid UTF-8 in element name")?;
                    open_elements.pop();
                    match name {
                        "channel" => {}
                        "item" => {
                            if let (Some(item), Some(f)) = (current_item.take(), feed.as_mut()) {
                                f.items.push(item);
                            }
                        }
                        _ => {
                            // After the pop, the top of the stack is the parent.
                            let parent = open_elements.last().map(String::as_str);
                            let in_item = parent == Some("item");
                            let in_channel = parent == Some("channel");
                            if in_item || in_channel {
                                Self::process_element(
                                    name,
                                    current_text.trim(),
                                    &mut feed,
                                    &mut current_item,
                                    in_channel,
                                    in_item,
                                )?;
                                current_text.clear();
                            }
                        }
                    }
                }
                Ok(Event::Text(ref e)) => {
                    // Append instead of overwrite: a field may arrive as
                    // several text/CDATA chunks.
                    current_text.push_str(&e.unescape()?);
                }
                Ok(Event::CData(ref e)) => {
                    current_text.push_str(std::str::from_utf8(e)?);
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow::anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        feed.ok_or_else(|| anyhow::anyhow!("No valid RSS feed found"))
    }

    /// Returns `true` when `element` names a container whose direct children
    /// are feed fields (`channel` or `item`).
    fn is_container(element: Option<&String>) -> bool {
        matches!(
            element.map(String::as_str),
            Some("channel") | Some("item")
        )
    }

    /// Stores the text of one finished field on the current item (when
    /// `in_item`) or on the feed (when `in_channel`).
    ///
    /// Unknown element names are ignored. A `pubDate` that cannot be parsed
    /// leaves `pub_date` as `None` rather than failing the whole feed.
    fn process_element(
        element_name: &str,
        text_content: &str,
        feed: &mut Option<RssFeed>,
        current_item: &mut Option<RssItem>,
        in_channel: bool,
        in_item: bool,
    ) -> Result<()> {
        if in_item {
            if let Some(item) = current_item.as_mut() {
                match element_name {
                    "title" => item.title = text_content.to_string(),
                    "description" => item.description = Some(text_content.to_string()),
                    "link" => item.link = Some(text_content.to_string()),
                    "pubDate" => item.pub_date = Self::parse_date(text_content).ok(),
                    "guid" => item.guid = Some(text_content.to_string()),
                    _ => {}
                }
            }
        } else if in_channel {
            if let Some(f) = feed.as_mut() {
                match element_name {
                    "title" => f.title = text_content.to_string(),
                    "description" => f.description = text_content.to_string(),
                    "link" => f.link = text_content.to_string(),
                    _ => {}
                }
            }
        }
        Ok(())
    }

    /// Parses a feed timestamp, trying the common RSS date formats in turn.
    ///
    /// RFC 2822 is tried first through chrono's dedicated parser, which also
    /// accepts named zones such as `GMT` that `%z` rejects. RFC 3339 and a few
    /// explicit fallback layouts follow.
    ///
    /// # Errors
    ///
    /// Returns an error when none of the formats match.
    fn parse_date(date_str: &str) -> Result<DateTime<FixedOffset>> {
        // Fallback layouts for feeds that follow neither RFC exactly.
        const FALLBACK_FORMATS: [&str; 3] = [
            "%a, %d %b %Y %H:%M:%S %z", // RFC 2822 with a numeric offset
            "%Y-%m-%dT%H:%M:%S%z",      // ISO 8601
            "%Y-%m-%d %H:%M:%S %z",     // space-separated variant
        ];

        let date_str = date_str.trim();
        DateTime::parse_from_rfc2822(date_str)
            .or_else(|_| DateTime::parse_from_rfc3339(date_str))
            .ok()
            .or_else(|| {
                FALLBACK_FORMATS
                    .iter()
                    .find_map(|format| DateTime::parse_from_str(date_str, format).ok())
            })
            .ok_or_else(|| anyhow::anyhow!("Unable to parse date: {}", date_str))
    }
}
src/fetcher.rs
use crate::models::FeedSource;
use anyhow::{Context, Result};
use reqwest::Client;
use std::time::Duration;
/// Downloads feed documents over HTTP using one shared `reqwest` client.
pub struct FeedFetcher {
    client: Client,
}

impl FeedFetcher {
    /// Creates a fetcher whose client uses a 30-second timeout and the
    /// `Rust RSS Reader/1.0` user agent.
    ///
    /// # Errors
    ///
    /// Returns an error when the HTTP client cannot be built.
    pub fn new() -> Result<Self> {
        let builder = Client::builder()
            .timeout(Duration::from_secs(30))
            .user_agent("Rust RSS Reader/1.0");
        builder
            .build()
            .map(|client| Self { client })
            .context("Failed to create HTTP client")
    }

    /// Fetches the body of `source.url` as text, printing a progress line
    /// first.
    ///
    /// # Errors
    ///
    /// Returns an error when the request cannot be sent, the response status
    /// is not a success, or the body cannot be read.
    pub async fn fetch_feed(&self, source: &FeedSource) -> Result<String> {
        println!("正在抓取 {} 的 RSS feed...", source.name);

        let request = self.client.get(&source.url);
        let response = request.send().await.context("Failed to send request")?;

        let status = response.status();
        if status.is_success() {
            response
                .text()
                .await
                .context("Failed to read response body")
        } else {
            Err(anyhow::anyhow!(
                "HTTP request failed with status: {}",
                status
            ))
        }
    }

    /// Fetches every source one after another and pairs each source name with
    /// its outcome, in input order. A failure on one source does not stop the
    /// rest.
    pub async fn fetch_multiple_feeds(&self, sources: &[FeedSource]) -> Vec<(String, Result<String>)> {
        let mut outcomes = Vec::with_capacity(sources.len());
        for source in sources {
            let label = source.name.clone();
            let body = self.fetch_feed(source).await;
            outcomes.push((label, body));
        }
        outcomes
    }
}
src/formatter.rs
use crate::models::{RssFeed, RssItem};
use chrono::{DateTime, Local, FixedOffset};
/// Prints feeds and their items to standard output.
pub struct FeedFormatter;

impl FeedFormatter {
    /// Maximum number of characters of a description shown before it is cut
    /// off with `...`.
    const MAX_DESCRIPTION_CHARS: usize = 200;

    /// Prints the feed header followed by every item, numbered from 1.
    /// Prints a placeholder line instead when the feed has no items.
    pub fn format_feed(feed: &RssFeed) {
        println!("\n{}", "=".repeat(80));
        println!("📰 {}", feed.title);
        println!("🔗 {}", feed.link);
        println!("📝 {}", feed.description);
        println!("{}", "=".repeat(80));
        if feed.items.is_empty() {
            println!("暫無文章");
            return;
        }
        for (index, item) in feed.items.iter().enumerate() {
            Self::format_item(item, index + 1);
        }
    }

    /// Prints one item: title, local-time publication date, link and a
    /// shortened plain-text description, each only when present.
    fn format_item(item: &RssItem, index: usize) {
        println!("\n{} 📄 {}", index, item.title);
        if let Some(date) = &item.pub_date {
            let local_time: DateTime<Local> = date.with_timezone(&Local);
            println!(" 📅 {}", local_time.format("%Y-%m-%d %H:%M:%S"));
        }
        if let Some(link) = &item.link {
            println!(" 🔗 {}", link);
        }
        if let Some(description) = &item.description {
            let clean_desc = Self::clean_html(description);
            // Cut on a character boundary: slicing at a fixed byte offset
            // panics when it lands inside a multi-byte character (CJK, emoji).
            let truncated = match clean_desc.char_indices().nth(Self::MAX_DESCRIPTION_CHARS) {
                Some((cut, _)) => format!("{}...", &clean_desc[..cut]),
                None => clean_desc,
            };
            println!(" 📖 {}", truncated);
        }
        println!(" {}", "-".repeat(60));
    }

    /// Strips HTML tags from `html` and collapses every run of whitespace
    /// into a single space, with no leading or trailing whitespace.
    fn clean_html(html: &str) -> String {
        // Compiled once and reused; one pattern covers every tag, so no
        // separate list of common tags is needed.
        static TAG_PATTERN: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        let tag_pattern = TAG_PATTERN.get_or_init(|| {
            regex::Regex::new(r"<[^>]*>").expect("tag pattern is a valid regex")
        });

        let without_tags = tag_pattern.replace_all(html, " ");
        // split_whitespace + join normalises spaces, tabs and newlines in one
        // pass and cannot loop forever.
        without_tags.split_whitespace().collect::<Vec<_>>().join(" ")
    }
}
src/main.rs
mod models;
mod parser;
mod fetcher;
mod formatter;
use models::FeedSource;
use parser::RssParser;
use fetcher::FeedFetcher;
use formatter::FeedFormatter;
use anyhow::Result;
use clap::{Arg, Command};
/// Entry point: reads the command line, chooses the feed sources, then
/// fetches, parses and prints each feed in turn.
///
/// A failure to fetch or parse one feed is reported on stderr and does not
/// stop the remaining feeds. The `--config` option is declared here, but its
/// value is not read anywhere in this function.
#[tokio::main]
async fn main() -> Result<()> {
    let urls_arg = Arg::new("urls")
        .short('u')
        .long("urls")
        .value_name("URL")
        .help("RSS feed URLs (comma separated)")
        .required(false);
    let config_arg = Arg::new("config")
        .short('c')
        .long("config")
        .value_name("FILE")
        .help("Configuration file with RSS sources")
        .required(false);

    let matches = Command::new("RSS Reader")
        .version("1.0")
        .author("Your Name")
        .about("A simple RSS feed reader in Rust")
        .arg(urls_arg)
        .arg(config_arg)
        .get_matches();

    // Explicit URLs take precedence over the built-in defaults.
    let feed_sources = match matches.get_one::<String>("urls") {
        Some(urls) => parse_urls(urls),
        None => get_default_sources(),
    };

    let fetcher = FeedFetcher::new()?;
    println!("🚀 開始抓取 {} 個 RSS feeds...", feed_sources.len());

    for source in &feed_sources {
        let content = match fetcher.fetch_feed(source).await {
            Ok(body) => body,
            Err(e) => {
                eprintln!("❌ 無法抓取 {} 的 RSS feed: {}", source.name, e);
                continue;
            }
        };

        match RssParser::parse(&content) {
            Ok(feed) => FeedFormatter::format_feed(&feed),
            Err(e) => eprintln!("❌ 解析 {} 的 RSS feed 失敗: {}", source.name, e),
        }
    }

    Ok(())
}
/// Splits a comma-separated list of URLs into feed sources named
/// `Feed 1`, `Feed 2`, … in input order.
///
/// Each entry is trimmed, and entries that are empty after trimming (an empty
/// argument, a doubled or trailing comma) are skipped, so no source with an
/// empty URL is produced. Numbering counts only the entries that are kept.
fn parse_urls(urls: &str) -> Vec<FeedSource> {
    urls.split(',')
        .map(str::trim)
        .filter(|url| !url.is_empty())
        .enumerate()
        .map(|(i, url)| FeedSource {
            name: format!("Feed {}", i + 1),
            url: url.to_string(),
        })
        .collect()
}
/// Returns the built-in feed sources used when no URLs are given on the
/// command line.
fn get_default_sources() -> Vec<FeedSource> {
    // (display name, feed URL) pairs, in the order they are returned.
    const DEFAULTS: [(&str, &str); 3] = [
        ("Rust Blog", "https://blog.rust-lang.org/feed.xml"),
        ("This Week in Rust", "https://this-week-in-rust.org/rss.xml"),
        ("Hacker News", "https://hnrss.org/frontpage"),
    ];

    DEFAULTS
        .iter()
        .map(|&(name, url)| FeedSource {
            name: name.to_string(),
            url: url.to_string(),
        })
        .collect()
}
# 使用預設的 RSS 來源
cargo run
# 指定自定義 RSS URLs
cargo run -- -u "<RSS Feed URL>"
# 查看幫助
cargo run -- --help
OKOK nice!