import asyncio
import aiohttp
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
import aiomysql
# Use a semaphore to cap concurrency
semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight
# Proxy credentials
proxyHost = "www.16yun.cn"
proxyPort = "5445"
proxyUser = "16QMSOML"
proxyPass = "280651"
# Build the proxy URL
PROXY = f"http://{proxyUser}:{proxyPass}@{proxyHost}:{proxyPort}"
# Random User-Agent generator
ua = UserAgent()
# Database configuration
DB_CONFIG = {
    'host': 'localhost',
    'port': 3306,
    'user': 'your_username',
    'password': 'your_password',
    'db': 'your_database',
    'charset': 'utf8mb4'
}
# Storage optimization: asynchronous batch write to the database
async def save_to_db(data):
    conn = await aiomysql.connect(**DB_CONFIG)
    async with conn.cursor() as cur:
        await cur.executemany(
            "INSERT INTO finance_data (column1, column2, column3) VALUES (%s, %s, %s)",
            data,
        )
    await conn.commit()
    conn.close()
# Crawl the page for a single stock
async def crawl_stock(stock_code, session):
    async with semaphore:
        url = f"https://finance.sina.com.cn/stock/{stock_code}.html"
        headers = {"User-Agent": ua.random}
        async with session.get(url, headers=headers, proxy=PROXY) as response:
            html = await response.text()
            data = parse(html)
            return data
# Parse the page content
def parse(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Placeholder selector: assumes the data sits in a table with class "example"
    table = soup.find('table', {'class': 'example'})
    data = []
    if table is None:  # nothing matched, return an empty result
        return data
    for row in table.find_all('tr'):
        cols = [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cols if ele])
    return data
# Main entry point
async def main(stock_codes):
    async with aiohttp.ClientSession() as session:
        tasks = [crawl_stock(stock_code, session) for stock_code in stock_codes]
        all_data = await asyncio.gather(*tasks)
    # Flatten the per-stock result lists
    flat_data = [item for sublist in all_data for item in sublist]
    # Write everything to the database in one asynchronous batch
    await save_to_db(flat_data)
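
# Optional variant (a sketch, not part of the original script): reuse a single
# aiomysql connection pool instead of opening a new connection on every
# save_to_db() call. The helper names below (save_to_db_pooled, main_with_pool)
# are illustrative; they assume the same DB_CONFIG and finance_data table as above.
async def save_to_db_pooled(pool, data):
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.executemany(
                "INSERT INTO finance_data (column1, column2, column3) VALUES (%s, %s, %s)",
                data,
            )
        await conn.commit()

async def main_with_pool(stock_codes):
    pool = await aiomysql.create_pool(**DB_CONFIG)
    try:
        async with aiohttp.ClientSession() as session:
            tasks = [crawl_stock(code, session) for code in stock_codes]
            all_data = await asyncio.gather(*tasks)
        flat_data = [item for sublist in all_data for item in sublist]
        await save_to_db_pooled(pool, flat_data)
    finally:
        pool.close()
        await pool.wait_closed()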
# Sample stock codes
stock_codes = [
    '000001',
    '000002',
    # ... more stock codes
]
# Run the crawler
asyncio.run(main(stock_codes))


