Telegram 簡易爬蟲 Bot

python系列文章 python入門 python 3 爬蟲 telegram bot
sn0wl3r0ker 2020-05-04 22:40:38 ‧ 8376 瀏覽
分享至
# 尚未整理，多餘的 code 也還沒刪除；改天有空再把文章補齊。
目前也沒有搭配 flask、webhook，相關程式碼都可以刪除；丟到 Heroku 用 worker 即可運行。
import requests, re, random, configparser, telegram, os
from bs4 import BeautifulSoup
from telegram.ext import Updater, CommandHandler, CallbackQueryHandler, Filters, MessageHandler
from telegram import InlineKeyboardMarkup, InlineKeyboardButton
import logging, time
from flask import Flask, request, app
from imgurpython import ImgurClient


# -*- coding: utf-8 -*-
# import sys
# reload(sys)
# sys.setdefaultencoding('utf8')

# Log everything from DEBUG upwards, with timestamp, logger name and level.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


# Settings are read from 'config.ini' (relative path). The code below reads
# the [TELEGRAM] and [imgur_api] sections from it.
config = configparser.ConfigParser()
config.read('config.ini')

# Initial Flask app (currently disabled; the bot runs without Flask)
#app = Flask(__name__)
#server = Flask(__name__)

# Initial bot by Telegram access token
# NOTE(review): the handlers below receive their own `bot` argument, which
# shadows this module-level instance inside them.
bot = telegram.Bot(token=(config['TELEGRAM']['ACCESS_TOKEN']))
# TOKEN = "YOUR_TELEGRAM_BOT_TOKEN_HERE"
# Imgur API credentials and the album id used by the /meme command.
client_id = config['imgur_api']['Client_ID']
client_secret = config['imgur_api']['Client_Secret']
album_id = config['imgur_api']['Album_ID']



# @server.route("/")
# def webhook(bot, update):
#     port = os.getenv('PORT', default=5000)
#     bot.remove_webhook()
#     bot.set_webhook(url='https://HEROKU_APP_NAME.herokuapp.com/' + TOKEN)
#     return "!", 200

# @server.route('/hook', methods=['POST'])
# def webhook_handler():
#     """Set route /hook with POST method will trigger this method."""
#     if request.method == "POST":
#         update = telegram.Update.de_json(request.get_json(force=True), bot)
#         dispatcher.process_update(update)
#     return 'ok'

def start(bot, update):
    """Reply to /start with a greeting that addresses the sender by first name."""
    first_name = update.message.from_user.first_name
    greeting = (
        '安安 {} 您好!我是一個Crawler Bot\n除此之外我也能像鸚鵡一樣學你說話喔!\n'
        '如需幫助可以用 /help 查看命令!'
    ).format(first_name)
    update.message.reply_text(greeting)

def help(bot, update):
    """Send the list of supported commands to the chat that asked for help."""
    # One literal per command line; adjacent literals are concatenated into
    # a single message text.
    usage = (
        '指令如下:\n'
        ' /start - 打招呼\n'
        ' /help - 顯示所有指令\n'
        ' /button - 顯示按鈕\n'
        ' /beauty - 隨便來張ptt正妹版圖\n'
        ' /oilp - 查詢現在油價\n'
        ' /ptthot - 抓些ptt近期熱門廢文\n'
        ' /movie - 抓些近期電影資訊\n'
        ' /meme - 隨機來張迷因圖\n'
        ' /movieyt - 隨機來一部觸電網YT預告\n'
        ' /pantech - 爬panx科技網上的文章\n'
        ' /panhot - 爬pansci網今日熱門文章\n'
        ' '
    )
    target_chat = update.message.chat_id
    bot.sendMessage(chat_id=target_chat, text=usage)
def button(bot, update):
    """Handle a press on an inline-keyboard button.

    Acknowledges the callback query, then replaces the keyboard message
    with a hint that shows the command carried in the button's
    ``callback_data``.
    """
    query = update.callback_query
    # Every callback query should be answered, even with no notification;
    # otherwise the Telegram client keeps showing a progress indicator on
    # the pressed button.
    query.answer()
    query.edit_message_text(text="請點擊此: {}".format(query.data))

def openb(bot, update):
    """Send the inline keyboard listing the bot's features to the chat."""

    def command_button(label, command):
        # Button whose press is delivered back to the bot as callback data.
        return InlineKeyboardButton(label, callback_data=command)

    def link_button(label, target):
        # Button that opens a URL instead of calling back into the bot.
        return InlineKeyboardButton(label, url=target)

    rows = [
        # row 1
        [
            link_button('PTT正妹版爬圖', '~~暫時刪除~~'),
            command_button('隨便來張ptt正妹版圖', '/beauty'),
        ],
        # row 2
        [
            command_button('查詢油價', '/oilp'),
            command_button('幫助Help', '/help'),
        ],
        # row 3
        [
            command_button('抓些ptt近期熱門廢文', '/ptthot'),
            command_button('抓些近期電影資訊', '/movie'),
        ],
        # row 4
        [
            command_button('隨機來點迷因圖', '/meme'),
            command_button('隨機來一部觸電網YT預告片', '/movieyt'),
        ],
        # row 5
        [
            command_button('打聲招呼', '/start'),
            link_button('聯繫作者', '~~暫時刪除~~'),
        ],
    ]
    markup = InlineKeyboardMarkup(rows)
    bot.sendMessage(chat_id=update.message.chat_id, text='選項如下:', reply_markup=markup)

def get_page_number(content):
    """Return the page number that follows the one named in *content*.

    *content* is expected to contain a fragment such as ``index123.html``;
    for that input the result is ``124``.

    Raises:
        ValueError: if *content* has no ``index<digits>.html`` fragment.
    """
    # Match "index", the digits and ".html" as one unit. Looking up "index"
    # and ".html" independently can silently slice an unrelated span when
    # either one is missing or they appear in the wrong order.
    match = re.search(r'index(\d+)\.html', content)
    if match is None:
        raise ValueError('no "index<N>.html" page reference in {!r}'.format(content))
    return int(match.group(1)) + 1

def ptt_beauty(bot, update):
    """Fetch a random image URL from the ptt-images API and send it as a photo."""
    response = requests.get('http://ptt-images.herokuapp.com/api/image/random/')
    # The code reads the image address from the 'Url' key of the JSON reply.
    photo_url = response.json().get('Url')
    chat = update.message.chat_id
    bot.sendPhoto(chat_id=chat, photo=photo_url)

def ptt_hot(bot, update):
    """Scrape the disp.cc PttHot board and send its titles and links to the chat."""
    board_url = 'http://disp.cc/b/PttHot'
    page = requests.session().get(board_url, verify=False)
    soup = BeautifulSoup(page.text, 'html.parser')

    entries = []
    for span in soup.select('#list div.row2 div span.listTitle'):
        href = span.find('a')['href']
        # Stop at this specific post id; it and everything after it is left out.
        if href == "796-59l9":
            break
        entries.append('{}\n{}\n\n'.format(span.text, "http://disp.cc/b/" + href))

    bot.sendMessage(chat_id=update.message.chat_id, text='熱門廢文如下:\n' + ''.join(entries))

#define油價查詢
def oil_price(bot, update):
    """Scrape gas.goodlife.tw and send the current fuel-price summary to the chat."""
    page = requests.session().get('https://gas.goodlife.tw/', verify=False)
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, 'html.parser')

    def first_text(selector):
        # Text of the first element matching the CSS selector.
        return soup.select(selector)[0].text

    # Keep only the part of the headline before the first '('.
    headline = first_text('#main').replace('\n', '').split('(')[0]
    prices = first_text('#gas-price').replace('\n\n\n', '').replace(' ', '')
    cpc_text = first_text('#cpc').replace(' ', '')

    summary = '{}\n{}{}'.format(headline, prices, cpc_text)
    bot.sendMessage(chat_id=update.message.chat_id, text='查詢油價如下:\n' + summary)

def movie(bot, update):
    """Scrape upcoming movies from atmovies.com.tw and send titles with links.

    At most the first 30 ``ul.filmListAll`` blocks are inspected. A block
    with no poster ``alt`` text or no movie link is skipped instead of
    aborting the whole command.
    """
    target_url = 'http://www.atmovies.com.tw/movie/next/0/'
    rs = requests.session()
    res = rs.get(target_url, verify=False)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    entries = []
    for index, data in enumerate(soup.select('ul.filmListAll')):
        if index == 30:
            break
        html = str(data)
        # The title is the poster image's alt text (capture group 1).
        title_match = re.search(r'alt="(.*?)"', html)
        # The link is the path segment of the anchor wrapping the poster image.
        link_match = re.search(r'\/movie\/(.*)\/\"\>\<img', html)
        if title_match is None or link_match is None:
            # Markup differs from what the patterns expect; skip this block.
            continue
        link = "http://www.atmovies.com.tw/movie/" + link_match.group(1) + '\n'
        entries.append('{}\n{}\n'.format(title_match.group(1), link))

    bot.sendMessage(chat_id=update.message.chat_id, text='最近電影如下:\n' + ''.join(entries))

def meme(bot, update):
    """Send a random image from the configured Imgur album to the chat.

    The "please wait" notice goes out before the Imgur request so the user
    sees it while the slow part is running. An empty album yields a short
    notice instead of an exception.
    """
    chat_id = update.message.chat_id
    bot.sendMessage(chat_id=chat_id, text="迷因圖可能爬取較久~請稍等!!")

    client = ImgurClient(client_id, client_secret)
    images = client.get_album_images(album_id)
    if not images:
        # Nothing to pick from; picking a random index would raise here.
        bot.sendMessage(chat_id=chat_id, text="相簿裡目前沒有圖片,請稍後再試!")
        return

    url = random.choice(images).link
    logger.debug('meme image url: %s', url)
    bot.sendPhoto(chat_id=chat_id, photo=url)

def movie_yt(bot, update):
    """Send a random video link scraped from the truemovie1 YouTube channel page.

    If the page yields no ``.yt-lockup-title`` links, a short notice is
    sent instead of failing on an empty list.
    """
    target_url = 'https://www.youtube.com/user/truemovie1/videos'
    rs = requests.session()
    res = rs.get(target_url, verify=False)
    soup = BeautifulSoup(res.text, 'html.parser')
    yturl = ['https://www.youtube.com{}'.format(data.find('a')['href'])
             for data in soup.select('.yt-lockup-title')]

    chat_id = update.message.chat_id
    if not yturl:
        # No matching elements in the fetched HTML; nothing to choose from.
        bot.sendMessage(chat_id=chat_id, text='目前抓不到預告片,請稍後再試!')
        return
    bot.sendMessage(chat_id=chat_id, text=random.choice(yturl))


def panhot(bot, update):
    """Scrape pansci.asia's daily hot list and send titles with links to the chat."""
    response = requests.session().get('https://pansci.asia/hots/day', verify=False)
    anchors = BeautifulSoup(response.text, 'html.parser').select('td.title a')
    # One "title / link / blank line" entry per anchor, in page order.
    listing = ''.join('{}\n{}\n\n'.format(a.text, a['href']) for a in anchors)
    bot.sendMessage(chat_id=update.message.chat_id, text="今日熱門文章如下:\n" + listing)

def pantech(bot, update):
    """Scrape the panx.asia front page and send article titles with links to the chat."""
    response = requests.session().get('https://panx.asia/', verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    headlines = soup.select('div.container div.row div.desc_wrap h2 a')
    # One "title / link / blank line" entry per headline, in page order.
    listing = ''.join('{}\n{}\n\n'.format(a.text, a['href']) for a in headlines)
    bot.sendMessage(chat_id=update.message.chat_id, text="近期文章如下:\n" + listing)

def echo(bot, update):
    """Reply to a text message with the same text."""
    incoming = update.message
    # Send the received text straight back as the reply.
    incoming.reply_text(incoming.text)


# def error(bot, update):
#     """Log Errors caused by Updates."""
#     logger.warning('Update "%s" caused error "%s"', bot, update.error)

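# NOTE: according to the library author's GitHub, use_context should be set to
# True, but (possibly a version issue) changing it currently breaks this code,
# so it stays False. The handlers above use the (bot, update) signature.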
def main():
    """Register every handler and run the bot with long polling."""
    # Use the same token the module-level `bot` is built from, instead of a
    # hard-coded placeholder that must be edited by hand.
    # use_context stays False because the handlers in this file take
    # (bot, update) arguments.
    updater = Updater(token=config['TELEGRAM']['ACCESS_TOKEN'], use_context=False)
    dispatcher = updater.dispatcher

    # Inline-keyboard presses.
    dispatcher.add_handler(CallbackQueryHandler(button))

    # Slash commands, registered in the same order as before.
    command_callbacks = (
        ('button', openb),
        ('help', help),
        ('start', start),
        ('beauty', ptt_beauty),
        ('oilp', oil_price),
        ('ptthot', ptt_hot),
        ('movie', movie),
        ('meme', meme),
        ('movieyt', movie_yt),
        ('pantech', pantech),
        ('panhot', panhot),
    )
    for command, callback in command_callbacks:
        dispatcher.add_handler(CommandHandler(command, callback))

    # Registered last: echo text messages back to the sender.
    dispatcher.add_handler(MessageHandler(Filters.text, echo))

    # Start the bot with polling (no webhook), then wait in idle().
    updater.start_polling()
    updater.idle()

# Start the bot only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
    # Flask entry point, currently disabled:
    #server.run(debug=True, host='0.0.0.0',port=5000)
目前Bot長這樣: https://t.me/PSBCBot
不過只是為了學習、好玩而已，不一定會開著。之後會再補實際使用的截圖之類的。