英招

牢骚太盛防肠断,风物长宜放眼量

0%

网页信息爬取(by jupyter)

前言

对于部分实时更新数据的网页进行信息爬取,基于python编写程序后采用PyInstaller封装成exe程序,并在云服务器上运行。

正文

查找网页html,并进行后续的数据爬取

import requests
import os
import time
import csv
import re
from datetime import datetime, timedelta

def fetch_national_surface_water_quality_daily():
    """Fetch the national surface-water quality table and save it to CSV.

    Posts a form request to the target endpoint, strips embedded HTML tags
    from the returned table header and body, and writes two CSV files:
    the full national data set, and only the rows whose second column
    equals '其它'.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        ValueError: if the JSON payload lacks 'thead' or 'tbody'.
    """
    url = '目标网址'

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'DNT': '1',
        'Origin': '目标网址',
        'Referer': '目标网址',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    data = {
        'AreaID': '',
        'RiverID': '',
        'MNName': '',
        'PageIndex': '1',
        'PageSize': '10000',
        'action': 'getRealDatas',
    }

    print("🔄 正在请求数据...")
    response = requests.post(url, headers=headers, data=data, timeout=30)
    response.raise_for_status()  # raise on HTTP error status

    data_json = response.json()

    if 'thead' not in data_json or 'tbody' not in data_json:
        raise ValueError("返回数据结构异常,缺少thead或tbody")

    # Ensure both output directories exist before writing.
    save_dirs = ['./数据/全国', './数据/其它']
    for d in save_dirs:
        os.makedirs(d, exist_ok=True)

    # Strip HTML tags from every cell; compile the pattern once (raw string)
    # instead of re-parsing it for each cell.
    tag_re = re.compile(r'<.*?>')
    thead = [tag_re.sub('', col) for col in data_json['thead']]
    rows = [[tag_re.sub('', str(col)) for col in row] for row in data_json['tbody']]

    # Timestamp for the file names (':' is illegal in Windows file names,
    # hence "%H-%M").
    timestamp = time.strftime("%Y-%m-%d %H-%M", time.localtime())

    # Save the full national data set.
    csv_path_all = os.path.join(save_dirs[0], f"{timestamp}.csv")
    with open(csv_path_all, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(thead)
        writer.writerows(rows)

    # Save only the rows whose second column equals '其它'.
    csv_path_zhj = os.path.join(save_dirs[1], f"{timestamp}.csv")
    with open(csv_path_zhj, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(thead)
        writer.writerows(row for row in rows if len(row) > 1 and row[1] == '其它')

    print("✅ 数据已成功保存!")
    print("📁 全国数据:", csv_path_all)
    print("📁 其它数据:", csv_path_zhj)

def _next_run_time(now, schedule_hours):
    """Return the next datetime at one of *schedule_hours* (today or tomorrow).

    *schedule_hours* must be sorted ascending, as the caller's list is.
    """
    for hour in schedule_hours:
        candidate = now.replace(hour=hour, minute=0, second=0, microsecond=0)
        if candidate > now:
            return candidate
    # Every slot today has already passed: use the first slot tomorrow.
    first_slot = now.replace(hour=schedule_hours[0], minute=0, second=0, microsecond=0)
    return first_slot + timedelta(days=1)


# Run one crawl at 1, 5, 9, 13, 17 and 21 o'clock every day.
def run_on_fixed_schedule():
    """Run the crawler on a fixed daily schedule, forever.

    Sleeps until the next scheduled hour, then retries the fetch
    (up to 1000 attempts, 10 s apart) until one attempt succeeds.
    """
    schedule_hours = [1, 5, 9, 13, 17, 21]

    while True:
        now = datetime.now()
        next_run = _next_run_time(now, schedule_hours)

        wait_seconds = (next_run - now).total_seconds()
        print(f"\n🕒 当前时间:{now.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"⏳ 等待至:{next_run.strftime('%Y-%m-%d %H:%M:%S')},剩余 {int(wait_seconds)} 秒...\n")
        time.sleep(wait_seconds)

        # Retry loop: the fetch may fail transiently (network, server),
        # so keep trying with a short pause until it succeeds or the
        # attempt budget is exhausted, then skip this slot.
        max_retries = 1000
        success = False
        for attempt in range(1, max_retries + 1):
            print(f"🔁 第 {attempt} 次尝试抓取数据...")
            try:
                fetch_national_surface_water_quality_daily()
                success = True
                break
            except Exception as e:
                print(f"⚠️ 第 {attempt} 次抓取失败:{e}")
                time.sleep(10)

        if not success:
            print("❌ 连续1000次抓取失败,跳过此次。")


# Script entry point: start the endless fixed-schedule crawl loop.
if __name__ == "__main__":
    run_on_fixed_schedule()

代码运行无误后,打开控制台(此处以 Anaconda 为例),在控制台内激活代码运行环境并切换到代码所在路径(该环境需先安装 pyinstaller 包),再将脚本打包为 exe 程序;打包完成后,exe 程序保存在 dist 文件夹中。

1
pyinstaller --onefile data_fetcher.py

备注

1.该代码为循环持续运行,为保证运行的稳定性,不推荐在主机运行,可购买云服务器(在此以华为云为例),购买离自己较近的云端服务器,远程登录桌面后将文件复制粘贴即可

2.代码仅供技术交流,请遵守相关法律条文