[Day26]_Threads 貼文備份-#3：自動化下載實體檔案

2025 iThome 鐵人賽

DAY 26

佛心分享-我的私藏工具箱

告別重複瑣事： n8n workflow 自動化工作實踐系列第 26 篇

17th鐵人賽 n8n workflow

ayao

團隊火箭隊 v2025

2025-09-27 21:45:03

133 瀏覽

分享至

在本篇文章中，我們將延續之前的設定，完成最關鍵的一步：將貼文資料轉換為實體的 Markdown、圖片和影片檔案

使用 Puppeteer 處理貼文資料

延續上一篇，下個節點同樣選擇「Puppeteer」的「Run Custom Script」
- 記得 Options 的區塊也要填上設定

程式碼填寫如下

// === Configuration ===
// Switches for which media kinds are processed, plus the retry cap used when
// fetching download links.
const MAX_RETRIES = 2;
const DOWNLOAD_IMAGES = true;
const DOWNLOAD_VIDEOS = true;

// Output items accumulated by this script, and the input items obtained from
// the n8n runtime through $items().
const results = [];
const items = $items();

// === Log collection ===
// Every entry written through log() is also stored here so the full run
// history can be inspected after the script finishes.
const executionLogs = [];

/**
 * Print a timestamped message to the console and record it in executionLogs.
 *
 * @param {string} message - Text to log.
 * @param {string} [type="info"] - Severity label stored alongside the entry.
 */
function log(message, type = "info") {
  // Timestamps are rendered as 24-hour Taipei wall-clock time.
  const timeFormat = { hour12: false, timeZone: "Asia/Taipei" };
  const timestamp = new Date().toLocaleTimeString("zh-TW", timeFormat);
  const fullMessage = `[${timestamp}] ${message}`;

  console.log(fullMessage);
  executionLogs.push({ timestamp, message, type, fullMessage });
}

/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
async function waitFor(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Check whether the shared $page can still be used.
 *
 * The page counts as valid when it is not closed and a trivial evaluate()
 * call succeeds; any thrown error is reported as "not valid".
 *
 * @returns {Promise<boolean>} true when the page is usable, false otherwise.
 */
async function isPageValid() {
  let usable = false;
  try {
    if (!$page.isClosed()) {
      // Round-trip into the page to confirm the execution context is alive.
      await $page.evaluate(() => document.title);
      usable = true;
    }
  } catch (_error) {
    usable = false;
  }
  return usable;
}

/**
 * Run a function inside the page via $page.evaluate after confirming the
 * page is still usable.
 *
 * @param {Function} func - Function to execute in the page context.
 * @param {...*} args - Arguments forwarded to func.
 * @returns {Promise<*>} Whatever $page.evaluate resolves to.
 * @throws {Error} "Page context is detached" when the page is not valid, or
 *   the error raised by the evaluation itself. Either way the failure is
 *   logged before being rethrown.
 */
async function safeEvaluate(func, ...args) {
  try {
    const pageUsable = await isPageValid();
    if (!pageUsable) {
      throw new Error("Page context is detached");
    }
    const outcome = await $page.evaluate(func, ...args);
    return outcome;
  } catch (error) {
    log(`❌ Page evaluation failed: ${error.message}`, "error");
    throw error;
  }
}

/**
 * Turn a Threads post URL into the matching Threadster URL by swapping every
 * "threads.com" occurrence for "threadster.net".
 *
 * @param {string} threadsUrl - Original Threads URL.
 * @returns {string} The converted URL, or the input unchanged when the
 *   conversion throws (the failure is logged).
 */
function convertToThreadsterUrl(threadsUrl) {
  let converted;
  try {
    converted = threadsUrl.replace(/threads\.com/g, "threadster.net");
    log(`🔗 Converted URL: ${threadsUrl} -> ${converted}`);
  } catch (error) {
    log(`❌ Error converting URL: ${error.message}`, "error");
    // Fall back to the original value so the caller still has something to use.
    return threadsUrl;
  }
  return converted;
}

/**
 * Poll the currently loaded page until it exposes media download links.
 *
 * Each attempt inspects the page for visible error messages, download buttons
 * and loading indicators. When download buttons are present, the anchors
 * matching "a.download__item__info__actions__button" are read and classified
 * as "video" or "image" from their table row text or URL path.
 *
 * @param {number} [maxAttempts=15] - Maximum number of inspection attempts.
 * @param {number} [intervalMs=3000] - Delay between two attempts, in ms.
 * @returns {Promise<Array<{url: string, text: string, type: string, resolution: string}>>}
 *   The classified links, or an empty array when the page shows an error
 *   message, the page context is detached, or all attempts are used up.
 */
async function pollForDownloadLinks(maxAttempts = 15, intervalMs = 3000) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      log(
        `🔍 Checking for download links (attempt ${attempt}/${maxAttempts})...`
      );
      if (!(await isPageValid()))
        throw new Error("Page context detached during polling");

      // Runs inside the page: must stay self-contained (no outer variables).
      const pageStatus = await safeEvaluate(() => {
        const status = {
          url: window.location.href,
          hasErrorMsg: false,
          errorMsgText: "",
          hasDownloadButtons: false,
          hasLoadingIndicator: false,
          downloadButtonsCount: 0,
        };

        // Collect the text of every error element that is actually visible.
        const errorElements = document.querySelectorAll(
          '.error__msg, .error, .alert-danger, [class*="error"]'
        );
        for (const elem of errorElements) {
          if (elem.style.display !== "none" && elem.offsetHeight > 0) {
            status.hasErrorMsg = true;
            status.errorMsgText += elem.textContent.trim() + " ";
          }
        }

        // Count anything that looks like a download button.
        status.downloadButtonsCount = document.querySelectorAll(
          'a.download__item__info__actions__button, .download-btn, [href*="download"]'
        ).length;
        status.hasDownloadButtons = status.downloadButtonsCount > 0;

        // Detect a loading indicator that is not explicitly hidden.
        status.hasLoadingIndicator = Array.from(
          document.querySelectorAll('.loading, .spinner, [class*="loading"]')
        ).some((el) => el.style.display !== "none");

        return status;
      });

      log(
        `📊 Page status - URL: ${pageStatus.url.slice(0, 50)}... | Buttons: ${
          pageStatus.downloadButtonsCount
        } | Loading: ${pageStatus.hasLoadingIndicator}`
      );

      if (pageStatus.hasErrorMsg && pageStatus.errorMsgText) {
        log(`❌ Threadster error: ${pageStatus.errorMsgText}`, "error");
        return [];
      }

      if (pageStatus.hasDownloadButtons) {
        // Runs inside the page: read and classify each download anchor.
        const downloadLinks = await safeEvaluate(() => {
          const links = [];
          for (const button of document.querySelectorAll(
            "a.download__item__info__actions__button"
          )) {
            const href = button.href;
            if (!href || !href.startsWith("http")) continue;

            let type = "unknown";
            const row = button.closest("tr");
            const typeText = (
              row?.querySelector("td:nth-child(2)")?.textContent || ""
            ).toLowerCase();
            const urlPath = new URL(href).pathname.toLowerCase();

            if (typeText.includes("video") || urlPath.includes("/video")) {
              type = "video";
            } else if (
              typeText.includes("photo") ||
              typeText.includes("image") ||
              urlPath.includes("/image")
            ) {
              type = "image";
            }

            const resolution =
              row?.querySelector("td:first-child")?.textContent.trim() ||
              "Unknown";
            links.push({
              url: href,
              text: `${type} (${resolution})`,
              type: type,
              resolution: resolution,
            });
          }
          return links;
        });

        // Links that could not be classified are of no use to the caller.
        const validLinks = downloadLinks.filter(
          (link) => link.type !== "unknown"
        );
        if (validLinks.length > 0) {
          log(
            `✅ Successfully extracted ${validLinks.length} valid download links.`
          );
          return validLinks;
        }
      }

      if (pageStatus.hasLoadingIndicator) {
        log(`⏳ Still processing, waiting...`);
      }

      // Only sleep when another attempt follows; waiting after the final
      // attempt would just delay the empty result (the catch path below
      // already behaves this way).
      if (attempt < maxAttempts) {
        await waitFor(intervalMs);
      }
    } catch (error) {
      log(`❌ Error during polling: ${error.message}`, "error");
      // A detached page cannot recover by waiting, so stop immediately.
      if (error.message.includes("detached")) return [];
      if (attempt === maxAttempts) return [];
      await waitFor(intervalMs);
    }
  }
  return [];
}

/**
 * Open the Threadster page for a Threads post and collect its download links.
 *
 * The shared $page is configured (viewport, user agent), navigated to the
 * converted URL, checked for error wording in the page body, and then polled
 * for download links.
 *
 * @param {string} threadsUrl - URL of the Threads post.
 * @param {number} [retryCount=0] - Number of retries already performed.
 * @returns {Promise<Array>} Links returned by pollForDownloadLinks, or an
 *   empty array when an error page is detected or the attempt fails. Failures
 *   whose message contains "detached" are retried up to MAX_RETRIES times.
 */
async function getDownloadLinksFromThreadster(threadsUrl, retryCount = 0) {
  const attemptNumber = retryCount + 1;
  const desktopUserAgent =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";

  try {
    log(`🔗 Getting download links for: ${threadsUrl} (attempt ${attemptNumber})`);

    // Present the page as a regular desktop browser.
    await $page.setViewport({ width: 1366, height: 768 });
    await $page.setUserAgent(desktopUserAgent);

    // Convert the URL and navigate straight to it.
    const threadsterUrl = convertToThreadsterUrl(threadsUrl);
    log(`🌐 Navigating directly to: ${threadsterUrl}`);

    const navigationOptions = { waitUntil: "networkidle0", timeout: 30000 };
    await $page.goto(threadsterUrl, navigationOptions);

    // Give the page a little extra time to finish rendering.
    await waitFor(3000);

    const currentUrl = await $page.url();
    log(`📄 Current page URL: ${currentUrl}`);

    // Basic sanity check on the loaded page (this callback runs in the page).
    const pageInfo = await safeEvaluate(() => {
      const rawText = document.body.textContent;
      const loweredText = rawText.toLowerCase();
      const errorMarkers = ["error", "not found", "404"];
      return {
        title: document.title,
        hasError: errorMarkers.some((marker) => loweredText.includes(marker)),
        bodyText: rawText.slice(0, 200),
      };
    });

    log(`📊 Page info - Title: ${pageInfo.title}`);
    log(`📊 Has error: ${pageInfo.hasError}`);

    if (pageInfo.hasError) {
      log(`❌ Error page detected`, "error");
      return [];
    }

    // Look for the download links.
    const links = await pollForDownloadLinks(10, 4000);
    return links;
  } catch (error) {
    log(`❌ Error with threadster.net: ${error.message}`, "error");

    const canRetry =
      error.message.includes("detached") && retryCount < MAX_RETRIES;
    if (!canRetry) {
      return [];
    }

    log(`🔄 Retrying...`);
    await waitFor(5000);
    return await getDownloadLinksFromThreadster(threadsUrl, retryCount + 1);
  }
}

/**
 * Derive a file extension for a media URL.
 *
 * The extension is taken from the last path segment of the URL when that
 * segment has an alphanumeric suffix after its final dot. Otherwise the URL
 * text is searched for "video" / "image" hints, and finally defaultExt is
 * used.
 *
 * @param {string} url - Media URL (query string and hash are ignored).
 * @param {string} defaultExt - Extension returned when nothing can be derived.
 * @returns {string} Lower-case extension without the leading dot.
 */
function getFileExtension(url, defaultExt) {
  // Treat a missing or non-string URL as empty so the hint checks below
  // cannot throw.
  const urlText = typeof url === "string" ? url : "";

  try {
    const { pathname } = new URL(urlText);
    const lastPart = pathname.substring(pathname.lastIndexOf("/") + 1);
    const dotIndex = lastPart.lastIndexOf(".");
    if (dotIndex !== -1) {
      const ext = lastPart.substring(dotIndex + 1).toLowerCase();
      // Reject suffixes such as "2885-15" that merely follow a dot inside an
      // identifier and are not real extensions.
      if (/^[a-z0-9]+$/.test(ext)) {
        return ext;
      }
    }
  } catch (e) {
    // Not an absolute URL: deliberately fall through to the keyword hints.
  }

  if (urlText.includes("video")) return "mp4";
  if (urlText.includes("image")) return "jpg";
  return defaultExt;
}

/**
 * Return the MIME type for an image file extension.
 * "jpg" / "jpe" are mapped to the registered "image/jpeg" subtype; every
 * other extension is used as the subtype as-is.
 *
 * @param {string} ext - Extension without the leading dot.
 * @returns {string} MIME type string.
 */
function getImageMimeType(ext) {
  const normalized = String(ext).toLowerCase();
  return normalized === "jpg" || normalized === "jpe"
    ? "image/jpeg"
    : `image/${normalized}`;
}

/**
 * Escape a value for use inside a double-quoted YAML scalar.
 * Backslashes are escaped first so the quote escapes are not doubled.
 *
 * @param {*} value - Value to embed.
 * @returns {string} Escaped text (without the surrounding quotes).
 */
function escapeYamlDoubleQuoted(value) {
  return String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

/**
 * Build the Markdown section that references the collected media files.
 *
 * @param {{images: Array, videos: Array}} mediaFiles - Collected media info.
 * @returns {string} Markdown text, or "" when there is no media.
 */
function buildMediaMarkdown(mediaFiles) {
  let mediaMarkdown = "";

  // Image references.
  if (mediaFiles.images.length > 0) {
    mediaMarkdown += "\n## 📸 圖片\n\n";
    for (const image of mediaFiles.images) {
      mediaMarkdown += `![圖片 ${image.index}](${image.relativePath})\n\n`;
    }
  }

  // Video references.
  if (mediaFiles.videos.length > 0) {
    mediaMarkdown += "\n## 🎬 影片\n\n";
    for (const video of mediaFiles.videos) {
      // An HTML <video> tag allows inline playback where the viewer supports it.
      mediaMarkdown += `<video controls>\n  <source src="${video.relativePath}" type="video/mp4">\n  您的瀏覽器不支援影片播放。\n</video>\n\n`;
      // A plain Markdown link is added as well.
      mediaMarkdown += `[📹 下載影片: ${video.fileName}](${video.relativePath})\n\n`;
      if (video.resolution) {
        mediaMarkdown += `*解析度: ${video.resolution}*\n\n`;
      }
    }
  }

  return mediaMarkdown;
}

/**
 * Assemble the complete Markdown document (YAML front matter + body).
 *
 * Lines are joined explicitly rather than written as one indented template
 * literal: source indentation inside a template literal ends up in the
 * output, which keeps the closing front-matter marker off column 0 and makes
 * Markdown render the body as an indented code block.
 *
 * @param {object} doc
 * @param {string} doc.title - Post title.
 * @param {string} doc.formattedDate - Date in YYYY-MM-DD form.
 * @param {string} doc.sourceUrl - URL of the original post.
 * @param {object} doc.threadData - Post data (username, like_count, text).
 * @param {Array} doc.tags - Tag list (may be empty).
 * @param {{images: Array, videos: Array}} doc.mediaFiles - Collected media.
 * @returns {string} Markdown document text.
 */
function buildMarkdownDocument({
  title,
  formattedDate,
  sourceUrl,
  threadData,
  tags,
  mediaFiles,
}) {
  const tagLines =
    tags.length > 0
      ? ["tags:", ...tags.map((tag) => `  - "${escapeYamlDoubleQuoted(tag)}"`)]
      : ["tags: []"];

  return [
    "---",
    `title: "${escapeYamlDoubleQuoted(title)}"`,
    `date: ${formattedDate}`,
    `source: ${sourceUrl}`,
    `author: ${threadData.username}`,
    `like_count: ${threadData.like_count || 0}`,
    `images: ${mediaFiles.images.length}`,
    `videos: ${mediaFiles.videos.length}`,
    ...tagLines,
    "---",
    "",
    threadData.text ?? "",
    buildMediaMarkdown(mediaFiles),
    "---",
    "",
    "*本文由 Threads 自動工具備份產生*  ",
    `*原始貼文：[${sourceUrl}](${sourceUrl})*`,
    "",
  ].join("\n");
}

// Main processing: for every input item emit image items (with binary data),
// at most one video item (download URL only) and one Markdown item.
for (const [index, item] of items.entries()) {
  try {
    const data = item.json;
    const threadData = data.thread;
    const sourceUrl = data.url;

    const imageUrls = threadData.images || [];
    const hasVideos = (threadData.videos || []).length > 0;
    // published_on is multiplied by 1000 before being handed to Date.
    const timestamp = threadData.published_on;
    const formattedDate = new Date(timestamp * 1000)
      .toISOString()
      .split("T")[0];
    const filePrefix = `${formattedDate}_${timestamp}`;

    log(
      `\n🔄 Processing Item ${index + 1}/${items.length}: Thread ${timestamp}`
    );
    log(
      `📊 Content: Images: ${imageUrls.length}, Videos: ${
        hasVideos ? "Yes" : "No"
      }`
    );

    // Media actually produced for this item; referenced by the Markdown file.
    const mediaFiles = {
      images: [],
      videos: [],
    };

    // Step 1: images - each one is fetched in its own page.
    if (DOWNLOAD_IMAGES) {
      for (const [i, imageUrl] of imageUrls.entries()) {
        try {
          log(`🖼️ Downloading image ${i + 1}/${imageUrls.length}: ${imageUrl}`);

          // Use a separate page so the main page is left untouched.
          const newPage = await $page.browser().newPage();
          try {
            await newPage.setUserAgent(
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            );
            const response = await newPage.goto(imageUrl, {
              waitUntil: "networkidle0",
              timeout: 30000,
            });

            if (!response || !response.ok()) {
              log(
                `❌ Failed to download image: ${response?.status()} ${response?.statusText()}`,
                "error"
              );
              continue;
            }

            const imageBuffer = await response.buffer();

            // Skip images whose body came back empty.
            if (!imageBuffer || imageBuffer.length === 0) {
              log(`❌ Downloaded image buffer is empty`, "error");
              continue;
            }

            const ext = getFileExtension(imageUrl, "jpg");
            const fileName = `${filePrefix}_image_${i + 1}.${ext}`;
            const relativePath = `images/${fileName}`;

            mediaFiles.images.push({
              fileName,
              relativePath,
              originalUrl: imageUrl,
              index: i + 1,
            });

            results.push({
              json: {
                fileType: "image",
                fileName,
                uploadPath: relativePath,
                originalUrl: imageUrl,
              },
              binary: {
                data: {
                  data: imageBuffer,
                  fileName,
                  mimeType: getImageMimeType(ext),
                },
              },
            });

            log(
              `✅ Generated binary item for image: ${fileName} (${imageBuffer.length} bytes)`
            );
          } finally {
            // Always release the temporary page, including on `continue`.
            await newPage.close();
          }
        } catch (imageError) {
          // One failed image must not abort the remaining ones.
          log(
            `❌ Error downloading image ${i + 1}: ${imageError.message}`,
            "error"
          );
        }
      }
    }

    // Step 2: videos - only the download link is resolved here; the emitted
    // item carries downloadUrl and no binary data.
    if (hasVideos && DOWNLOAD_VIDEOS) {
      log(`🎬 Getting video download links from threadster.net...`);
      const downloadLinks = await getDownloadLinksFromThreadster(sourceUrl);
      const videoLinks = downloadLinks.filter(
        (link) => link.type === "video"
      );

      if (videoLinks.length > 0) {
        // Prefer a link labelled "best"; otherwise take the first one.
        const bestVideoLink =
          videoLinks.find((l) => l.text.toLowerCase().includes("best")) ||
          videoLinks[0];
        const ext = getFileExtension(bestVideoLink.url, "mp4");
        const fileName = `${filePrefix}_video_1.${ext}`;
        const relativePath = `videos/${fileName}`;

        mediaFiles.videos.push({
          fileName,
          relativePath,
          downloadUrl: bestVideoLink.url,
          resolution: bestVideoLink.resolution,
        });

        results.push({
          json: {
            fileType: "video",
            fileName,
            downloadUrl: bestVideoLink.url,
            uploadPath: relativePath,
          },
        });

        log(`🎬 Generated item for video download: ${fileName}`);
      } else {
        log(`⚠️ No video download links found`, "warning");
      }
    }

    // Step 3: Markdown document referencing the media collected above.
    const markdownFileName = `${filePrefix}.md`;

    // data.output is optional: fall back to the file prefix for the title
    // and to no tags, so a missing value does not abort an item whose media
    // items were already emitted.
    const title = data.output?.title ?? filePrefix;
    const tags = Array.isArray(data.output?.tags) ? data.output.tags : [];

    const fullMarkdownContent = buildMarkdownDocument({
      title,
      formattedDate,
      sourceUrl,
      threadData,
      tags,
      mediaFiles,
    });

    const markdownBuffer = Buffer.from(fullMarkdownContent, "utf8");

    results.push({
      json: {
        fileType: "markdown",
        fileName: markdownFileName,
        uploadPath: markdownFileName,
        mediaFiles, // media info for any later step that needs it
      },
      binary: {
        data: {
          data: markdownBuffer,
          fileName: markdownFileName,
          mimeType: "text/markdown",
        },
      },
    });

    log(
      `📝 Generated binary item for ${markdownFileName} with ${mediaFiles.images.length} images and ${mediaFiles.videos.length} videos`
    );
  } catch (itemError) {
    // A failing item is logged and skipped; the remaining items still run.
    log(
      `❌ Error processing item ${index + 1}: ${itemError.message}`,
      "error"
    );
    console.error(itemError.stack);
  }
}

// 將日誌附加到第一個項目上
if (results.length > 0) {
  results[0].json.executionLogs = executionLogs;
}

return results;

這個腳本是整個備份流程的大腦，它會負責以下幾項重要任務：
1. 直接下載圖片：腳本會直接請求原始圖片 URL，並將其轉換為二進制檔案
2. 解析影片連結：由於 Threads 影片的複雜性，我們會透過 threadster.net 這個第三方服務來取得影片的實際下載連結
3. 產生 Markdown 文件：將貼文的文字內容、作者、發布日期、按讚數等數據，以及前面下載好的圖片和影片引用，全部整合到一個結構化的 Markdown (.md) 檔案中

使用 Switch 節點進行檔案分流

由於上一步產生了不同類型的檔案（圖片、影片、Markdown），我們需要一個「Switch」節點來進行分流，確保每種類型的檔案都走正確的處理路徑

下個節點選擇「Switch」來根據不同檔案格式做分流，變數填上
```
{{ $json.fileType }}
```

透過 HTTP Request 下載影片

對於影片檔案，我們在上一步只取得了下載連結（URL），還沒有真正下載。現在，我們需要為影片的路徑新增一個「HTTP Request」節點

接著在「0」的路徑選擇「HTTP Request」

方法為「GET」，URL 填寫如下，並新增「Options」的「Response」

{{ $json.downloadUrl ? $json.downloadUrl : "data:application/octet-stream;base64," }}

image 3.png

Response Format 設定為「File」，Put Output in Field 寫上「data」

使用 Merge 節點匯集所有檔案

處理完所有檔案後，我們需要將它們重新匯集在一起。新增一個「Merge」節點，並將 Switch 節點分流出去的所有路徑（圖片、處理完的影片、Markdown）全部連接到這個 Merge 節點上，如此一來，無論貼文包含哪種類型的媒體，最終都會在這裡被整合，準備進行最後的儲存步驟

下個選擇「Merge」節點，命名為「Merge binary」
接著把剛剛「Switch」的「1」、「2」路徑都接到「Merge」的「input1」，而現在的流程看起來會像這樣
再來就找個有圖片或影片的貼文放到「Edit Fields」裡面，然後點選正下方的「Execute workflow」來試跑看看，可以看到最後的節點會幫我們把檔案產出來