iT邦幫忙

2025 iThome 鐵人賽

DAY 12
0
Rust

Rust 實戰專案集:30 個漸進式專案從工具到服務系列 第 12

RSS 訂閱閱讀器 - 抓取並解析 RSS feeds

  • 分享至 

  • xImage
  •  

前言

RSS (Really Simple Syndication) 是一種常用於訂閱內容的格式,
常見於 blog 等系統,本質上是一份 XML 檔案。
現在仍潛藏在各種網站中,雖然這年頭越來越少人會做這樣的東西,不過我們今天把它實現

這裡我們學習如何抓取和解析 RSS Feed

專案目標

  • 抓取遠端 RSS feeds
  • 解析 RSS XML 格式
  • 顯示文章標題、描述和發布時間
  • 支援多個 RSS 來源
  • 錯誤處理(重試機制在本篇的程式碼中尚未實作)

讓我們開始吧

cargo new rss_reader
cd rss_reader

依賴

[dependencies]
# HTTP client used by src/fetcher.rs to download feed documents.
reqwest = { version = "0.11", features = ["json"] }
# Async runtime; main.rs is annotated with #[tokio::main].
tokio = { version = "1.0", features = ["full"] }
# Provides the Serialize/Deserialize derives used on the models.
serde = { version = "1.0", features = ["derive"] }
# Date/time types for item publication dates.
chrono = { version = "0.4", features = ["serde"] }
# Event-based XML reader used by src/parser.rs.
quick-xml = "0.31"
# Command-line argument parsing in main.rs.
clap = { version = "4.0", features = ["derive"] }
# Error type and `.context(...)` helpers used across the modules.
anyhow = "1.0"
# NOTE(review): not referenced by the code shown in this article; confirm it is needed.
url = "2.4"
# Used by src/formatter.rs to strip HTML tags from descriptions.
regex = "1.10"

開始撰寫程式碼

src/models.rs:一樣,我們先定義專案會用到的資料結構

use chrono::{DateTime, FixedOffset};
use serde::{Deserialize, Serialize};

/// A parsed RSS channel together with the items it contains.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RssFeed {
    /// Channel title.
    pub title: String,
    /// Channel description.
    pub description: String,
    /// Link stored for the channel.
    pub link: String,
    /// Items collected for this feed, in the order they were added.
    pub items: Vec<RssItem>,
}

/// A single entry of an RSS feed.
///
/// Everything except the title is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RssItem {
    /// Item title.
    pub title: String,
    /// Item description, if any; may contain HTML markup.
    pub description: Option<String>,
    /// Link to the item, if any.
    pub link: Option<String>,
    /// Publication time with its UTC offset, if one is available.
    pub pub_date: Option<DateTime<FixedOffset>>,
    /// Identifier string for the item, if any.
    pub guid: Option<String>,
}

/// A feed to download: a display name plus the URL to request.
#[derive(Debug, Clone)]
pub struct FeedSource {
    /// Human-readable label used in progress and error messages.
    pub name: String,
    /// Address the feed is fetched from.
    pub url: String,
}

impl RssFeed {
    pub fn new(title: String, description: String, link: String) -> Self {
        Self {
            title,
            description,
            link,
            items: Vec::new(),
        }
    }
}

製作解析器 - 處理 RSS XML 解析

src/parser.rs

use crate::models::{RssFeed, RssItem};
use anyhow::{Context, Result};
use chrono::{DateTime, FixedOffset};
use quick_xml::events::Event;
use quick_xml::Reader;

/// Event-driven parser that turns an RSS XML document into an [`RssFeed`].
pub struct RssParser;

impl RssParser {
    /// Parses `xml_content` as an RSS document.
    ///
    /// Only direct children of `<channel>` populate the feed metadata and only
    /// direct children of `<item>` populate an item, so nested structures such
    /// as `<image><title>…</title></image>` cannot overwrite the feed's own
    /// fields. Text and CDATA fragments of one element are concatenated and
    /// trimmed before being stored.
    ///
    /// # Errors
    ///
    /// Returns an error when the XML is malformed, when an element name or a
    /// CDATA section is not valid UTF-8, or when the document contains no
    /// `<channel>` element.
    pub fn parse(xml_content: &str) -> Result<RssFeed> {
        let mut reader = Reader::from_str(xml_content);
        let mut buf = Vec::new();

        let mut feed: Option<RssFeed> = None;
        let mut current_item: Option<RssItem> = None;
        // Names of the elements that are currently open, outermost first.
        let mut open_elements: Vec<String> = Vec::new();
        // Character data collected since the most recent start or end tag.
        let mut current_text = String::new();

        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(ref e)) => {
                    // Convert to an owned name right away so nothing borrows
                    // from the temporary returned by `name()`.
                    let name = Self::element_name(e.name().as_ref())?;

                    match name.as_str() {
                        "channel" => {
                            // Create the feed as soon as the channel opens so
                            // that fields and items are kept regardless of
                            // where (or whether) <title> appears.
                            if feed.is_none() {
                                feed = Some(RssFeed::new(
                                    String::new(),
                                    String::new(),
                                    String::new(),
                                ));
                            }
                        }
                        "item" => {
                            current_item = Some(RssItem {
                                title: String::new(),
                                description: None,
                                link: None,
                                pub_date: None,
                                guid: None,
                            });
                        }
                        _ => {}
                    }

                    // Whitespace seen before this tag belongs to the parent.
                    current_text.clear();
                    open_elements.push(name);
                }
                Ok(Event::End(ref e)) => {
                    let name = Self::element_name(e.name().as_ref())?;

                    // Drop the element being closed; what remains on top is
                    // its parent.
                    open_elements.pop();
                    let parent = open_elements.last().map(String::as_str);
                    let text = current_text.trim();

                    if name == "item" {
                        if let (Some(item), Some(feed)) = (current_item.take(), feed.as_mut()) {
                            feed.items.push(item);
                        }
                    } else if parent == Some("item") {
                        if let Some(item) = current_item.as_mut() {
                            Self::apply_item_field(item, &name, text);
                        }
                    } else if parent == Some("channel") {
                        if let Some(feed) = feed.as_mut() {
                            Self::apply_channel_field(feed, &name, text);
                        }
                    }

                    current_text.clear();
                }
                Ok(Event::Text(ref e)) => {
                    // Append rather than overwrite: one element's content may
                    // arrive as several text/CDATA fragments.
                    current_text.push_str(&e.unescape()?);
                }
                Ok(Event::CData(ref e)) => {
                    current_text.push_str(std::str::from_utf8(e)?);
                }
                Ok(Event::Eof) => break,
                Err(e) => return Err(anyhow::anyhow!("XML parsing error: {}", e)),
                _ => {}
            }
            buf.clear();
        }

        feed.ok_or_else(|| anyhow::anyhow!("No valid RSS feed found"))
    }

    /// Decodes a raw element name into an owned string.
    fn element_name(raw: &[u8]) -> Result<String> {
        std::str::from_utf8(raw)
            .map(str::to_owned)
            .context("Invalid UTF-8 in element name")
    }

    /// Stores the text of a direct child of `<channel>` on the feed.
    fn apply_channel_field(feed: &mut RssFeed, element_name: &str, text: &str) {
        match element_name {
            "title" => feed.title = text.to_string(),
            "description" => feed.description = text.to_string(),
            "link" => feed.link = text.to_string(),
            _ => {}
        }
    }

    /// Stores the text of a direct child of `<item>` on the item.
    fn apply_item_field(item: &mut RssItem, element_name: &str, text: &str) {
        match element_name {
            "title" => item.title = text.to_string(),
            "description" => item.description = Some(text.to_string()),
            "link" => item.link = Some(text.to_string()),
            // An unparseable date is not fatal; the item simply has no date.
            "pubDate" => item.pub_date = Self::parse_date(text).ok(),
            "guid" => item.guid = Some(text.to_string()),
            _ => {}
        }
    }

    /// Parses a publication date, trying the formats commonly seen in feeds.
    ///
    /// RFC 2822 is tried first through chrono's dedicated parser because RSS
    /// dates frequently end in a zone name such as `GMT`, which the `%z`
    /// specifier rejects. RFC 3339 and a few explicit patterns follow.
    ///
    /// # Errors
    ///
    /// Returns an error when none of the supported formats match.
    fn parse_date(date_str: &str) -> Result<DateTime<FixedOffset>> {
        const FALLBACK_FORMATS: [&str; 3] = [
            "%a, %d %b %Y %H:%M:%S %z", // RFC 2822 with a numeric offset
            "%Y-%m-%dT%H:%M:%S%z",      // ISO 8601 with a numeric offset
            "%Y-%m-%d %H:%M:%S %z",     // space-separated variant
        ];

        let date_str = date_str.trim();

        if let Ok(date) = DateTime::parse_from_rfc2822(date_str) {
            return Ok(date);
        }
        if let Ok(date) = DateTime::parse_from_rfc3339(date_str) {
            return Ok(date);
        }

        FALLBACK_FORMATS
            .iter()
            .find_map(|format| DateTime::parse_from_str(date_str, format).ok())
            .ok_or_else(|| anyhow::anyhow!("Unable to parse date: {}", date_str))
    }
}

這裡我們製作 fetcher,透過 HTTP client 抓取遠端的 RSS 資料

src/fetcher.rs

use crate::models::FeedSource;
use anyhow::{Context, Result};
use reqwest::Client;
use std::time::Duration;

/// Downloads feed documents over HTTP using one shared client.
pub struct FeedFetcher {
    client: Client,
}

impl FeedFetcher {
    /// Creates a fetcher whose client has a 30-second timeout and a fixed
    /// user-agent string.
    ///
    /// # Errors
    ///
    /// Returns an error if the HTTP client cannot be built.
    pub fn new() -> Result<Self> {
        Client::builder()
            .user_agent("Rust RSS Reader/1.0")
            .timeout(Duration::from_secs(30))
            .build()
            .map(|client| Self { client })
            .context("Failed to create HTTP client")
    }

    /// Downloads the body of `source.url` as text, announcing the fetch on
    /// standard output first.
    ///
    /// # Errors
    ///
    /// Returns an error if the request cannot be sent, the response status is
    /// not a success status, or the body cannot be read.
    pub async fn fetch_feed(&self, source: &FeedSource) -> Result<String> {
        println!("正在抓取 {} 的 RSS feed...", source.name);

        let request = self.client.get(&source.url);
        let response = request.send().await.context("Failed to send request")?;

        let status = response.status();
        if status.is_success() {
            response
                .text()
                .await
                .context("Failed to read response body")
        } else {
            Err(anyhow::anyhow!(
                "HTTP request failed with status: {}",
                status
            ))
        }
    }

    /// Fetches every source one after another, pairing each source name with
    /// the outcome of its download. A failure does not stop later fetches.
    pub async fn fetch_multiple_feeds(&self, sources: &[FeedSource]) -> Vec<(String, Result<String>)> {
        let mut outcomes = Vec::new();

        for source in sources {
            let outcome = self.fetch_feed(source).await;
            let label = source.name.clone();
            outcomes.push((label, outcome));
        }

        outcomes
    }
}

格式化輸出

src/formatter.rs

use crate::models::{RssFeed, RssItem};
use chrono::{DateTime, Local, FixedOffset};

/// Stateless helper that pretty-prints parsed feeds to standard output.
pub struct FeedFormatter;

impl FeedFormatter {
    /// Maximum number of characters of a description shown before it is cut.
    const MAX_DESCRIPTION_CHARS: usize = 200;

    /// Prints the feed header followed by every item, numbered from 1.
    ///
    /// Prints a placeholder line instead of items when the feed is empty.
    pub fn format_feed(feed: &RssFeed) {
        println!("\n{}", "=".repeat(80));
        println!("📰 {}", feed.title);
        println!("🔗 {}", feed.link);
        println!("📝 {}", feed.description);
        println!("{}", "=".repeat(80));

        if feed.items.is_empty() {
            println!("暫無文章");
            return;
        }

        for (index, item) in feed.items.iter().enumerate() {
            Self::format_item(item, index + 1);
        }
    }

    /// Prints one item: title, local publication time, link and a shortened,
    /// tag-free description. Optional parts are skipped when absent.
    fn format_item(item: &RssItem, index: usize) {
        println!("\n{} 📄 {}", index, item.title);

        if let Some(date) = &item.pub_date {
            let local_time: DateTime<Local> = date.with_timezone(&Local);
            println!("   📅 {}", local_time.format("%Y-%m-%d %H:%M:%S"));
        }

        if let Some(link) = &item.link {
            println!("   🔗 {}", link);
        }

        if let Some(description) = &item.description {
            let clean_desc = Self::clean_html(description);
            // Cut on a character boundary: slicing at a fixed byte offset
            // panics when it lands inside a multi-byte character (CJK, emoji).
            let cut = clean_desc
                .char_indices()
                .nth(Self::MAX_DESCRIPTION_CHARS)
                .map(|(byte_index, _)| byte_index);
            let truncated = match cut {
                Some(byte_index) => format!("{}...", &clean_desc[..byte_index]),
                None => clean_desc,
            };
            println!("   📖 {}", truncated);
        }

        println!("   {}", "-".repeat(60));
    }

    /// Strips HTML tags from `html` and collapses all whitespace runs into
    /// single spaces, trimming both ends.
    ///
    /// This is a simple textual cleanup, not an HTML parser; character
    /// entities are left as they are.
    fn clean_html(html: &str) -> String {
        // Compile the pattern once instead of once per item.
        static TAG_PATTERN: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        let tag_pattern = TAG_PATTERN.get_or_init(|| {
            regex::Regex::new(r"<[^>]*>").expect("hard-coded tag pattern is a valid regex")
        });

        // Every tag becomes a space so adjacent words do not get glued
        // together; the generic pattern also covers the common tags.
        let without_tags = tag_pattern.replace_all(html, " ");

        without_tags.split_whitespace().collect::<Vec<_>>().join(" ")
    }
}

最後,我們的 main.rs

mod models;
mod parser;
mod fetcher;
mod formatter;

use models::FeedSource;
use parser::RssParser;
use fetcher::FeedFetcher;
use formatter::FeedFormatter;

use anyhow::Result;
use clap::{Arg, Command};

/// Entry point: parses the command line, then fetches, parses and prints each
/// feed in turn.
///
/// A failure on one feed is reported on standard error and does not stop the
/// remaining feeds from being processed.
///
/// # Errors
///
/// Returns an error only if the HTTP client cannot be created.
#[tokio::main]
async fn main() -> Result<()> {
    let matches = Command::new("RSS Reader")
        .version("1.0")
        .author("Your Name")
        .about("A simple RSS feed reader in Rust")
        .arg(
            Arg::new("urls")
                .short('u')
                .long("urls")
                .value_name("URL")
                .help("RSS feed URLs (comma separated)")
                .required(false)
        )
        .arg(
            Arg::new("config")
                .short('c')
                .long("config")
                .value_name("FILE")
                .help("Configuration file with RSS sources")
                .required(false)
        )
        .get_matches();

    // The option is declared but nothing loads the file yet; say so instead
    // of silently ignoring what the user asked for.
    if let Some(config_path) = matches.get_one::<String>("config") {
        eprintln!("⚠️ 尚未支援設定檔 (--config {}),將忽略此選項", config_path);
    }

    let feed_sources = match matches.get_one::<String>("urls") {
        Some(urls) => parse_urls(urls),
        None => get_default_sources(),
    };

    let fetcher = FeedFetcher::new()?;

    println!("🚀 開始抓取 {} 個 RSS feeds...", feed_sources.len());

    for source in &feed_sources {
        // `{:#}` prints the whole anyhow context chain, so the underlying
        // cause is shown and not just the outermost message.
        let content = match fetcher.fetch_feed(source).await {
            Ok(content) => content,
            Err(e) => {
                eprintln!("❌ 無法抓取 {} 的 RSS feed: {:#}", source.name, e);
                continue;
            }
        };

        match RssParser::parse(&content) {
            Ok(feed) => FeedFormatter::format_feed(&feed),
            Err(e) => eprintln!("❌ 解析 {} 的 RSS feed 失敗: {:#}", source.name, e),
        }
    }

    Ok(())
}

/// Splits a comma-separated list of URLs into feed sources named
/// `Feed 1`, `Feed 2`, … in order of appearance.
///
/// Surrounding whitespace is trimmed and empty entries (for example from a
/// trailing comma or an empty argument) are skipped, so no source with an
/// empty URL is produced.
fn parse_urls(urls: &str) -> Vec<FeedSource> {
    urls.split(',')
        .map(str::trim)
        .filter(|url| !url.is_empty())
        .enumerate()
        .map(|(i, url)| FeedSource {
            name: format!("Feed {}", i + 1),
            url: url.to_string(),
        })
        .collect()
}

/// Returns the built-in feed list used when no URLs are given on the
/// command line.
fn get_default_sources() -> Vec<FeedSource> {
    // (display name, feed URL)
    const DEFAULTS: [(&str, &str); 3] = [
        ("Rust Blog", "https://blog.rust-lang.org/feed.xml"),
        ("This Week in Rust", "https://this-week-in-rust.org/rss.xml"),
        ("Hacker News", "https://hnrss.org/frontpage"),
    ];

    DEFAULTS
        .iter()
        .map(|&(name, url)| FeedSource {
            name: name.to_string(),
            url: url.to_string(),
        })
        .collect()
}

開始使用

# 使用預設的 RSS 來源
cargo run

# 指定自定義 RSS URLs
cargo run -- -u "<RSS Feed URL>"

# 查看幫助
cargo run -- --help

OKOK nice!


上一篇
URL 縮短服務 - 類似 bit.ly 的 URL 縮短器
下一篇
網站健康檢查器 - 監控多個網站的可用性
系列文
Rust 實戰專案集:30 個漸進式專案從工具到服務13
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言