#!/usr/bin/env python # -*- coding:utf-8 -*- """ Date: 2023/9/15 19:00 Desc: 请求网站内容的函数: 在链接失败后可重复 20 次 """ import time from io import StringIO from typing import Dict import pandas as pd import requests def requests_link( url: str, encoding: str = "utf-8", method: str = "get", data: Dict = None, headers: Dict = None, ): """ 利用 requests 请求网站, 爬取网站内容, 如网站链接失败, 可重复爬取 20 次 :param url: string 网站地址 :param encoding: string 编码类型: "utf-8", "gbk", "gb2312" :param method: string 访问方法: "get", "post" :param data: dict 上传数据: 键值对 :param headers: dict 游览器请求头: 键值对 :return: requests.response 爬取返回内容: response """ i = 0 while True: try: if method == "get": r = requests.get(url, timeout=20, headers=headers) r.encoding = encoding return r elif method == "post": r = requests.post(url, timeout=20, data=data, headers=headers) r.encoding = encoding return r else: raise ValueError("请提供正确的请求方式") except: # noqa: E722 i += 1 print(f"第{str(i)}次链接失败, 最多尝试 20 次") time.sleep(5) if i > 20: return None def pandas_read_html_link( url: str, encoding: str = "utf-8", method: str = "get", data: Dict = None, headers: Dict = None, ): """ 利用 pandas 提供的 read_html 函数来直接提取网页中的表格内容, 如网站链接失败, 可重复爬取 20 次 :param url: string 网站地址 :param encoding: string 编码类型: "utf-8", "gbk", "gb2312" :param method: string 访问方法: "get", "post" :param data: dict 上传数据: 键值对 :param headers: dict 游览器请求头: 键值对 :return: requests.response 爬取返回内容: response """ i = 0 while True: try: if method == "get": r = requests.get(url, timeout=20, headers=headers) r.encoding = encoding r = pd.read_html(StringIO(r.text), encoding=encoding) return r elif method == "post": r = requests.post(url, timeout=20, data=data, headers=headers) r.encoding = encoding r = pd.read_html(StringIO(r.text), encoding=encoding) return r else: raise ValueError("请提供正确的请求方式") except requests.exceptions.Timeout as e: i += 1 print(f"第{str(i)}次链接失败, 最多尝试20次", e) time.sleep(5) if i > 20: return None