# encoding:utf-8
import requests
import httpx
import json
from bs4 import BeautifulSoup
import time
import csv
import random
session = requests.Session()  # one shared Session reused for every plain-requests call
cookitext = input('Please paste your Weibo cookie: ').replace('\n', '')
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': cookitext,
    'referer': 'https://weibo.com/2750621294/KAf1AFVPD',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'traceparent': '00-5b5f81f871c6ff6846bf3a92f1d5efed-1ab32a39ad75711a-00',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': '7yIZGS_IPx7EteZ6TT86YYAZ',
}
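# Note: the cookie pasted above must come from a logged-in weibo.com browser session
# (copy it from the request headers in the browser's developer tools). The referer,
# traceparent and x-xsrf-token values here are samples captured from one such session;
# treat them as placeholders that may need refreshing if Weibo rejects the requests.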
def getWeiboCommentinfo(url):
    """
    Fetch one page of Weibo search results and extract, for every post card,
    the post text plus the post's MID and the author's UID.
    :param url: search-result page URL
    :return:
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': cookitext,
        'referer': 'https://www.baidu.com/link?url=79KIn7lPAsM1SqpiE6ub8unuDW2xwxX-4CyvQvA8HLS&wd=&eqid=dfbc01160004d2dc00000004619e5581',
        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    }
    response = session.get(url, headers=headers)
    html = BeautifulSoup(response.content, 'lxml')
    content = html.find_all('div', class_="card-wrap")  # note: the keyword is class_ (with a trailing underscore)
    for ct in content:
        user_info = ct.find_all('a', class_="name")
        if user_info != []:
            try:
                mid = ct['mid']  # post ID (MID)
            except KeyError:
                pass
            else:
                user_name = user_info[0].text  # author name
                uid = str(ct.find('div', class_="avator").find('a')['href']).split('/')[-1].split("?")[0]  # author UID
                user_index = "https:" + user_info[0]['href']  # author profile URL
                user_from = str(ct.find('p', class_="from").text).replace(' ', '').replace('\n', '')  # post time and client device
                weibo_content = str(ct.find('p', class_="txt").text).replace(' ', '').replace('\n', '')  # post text
                data = [weibo_content, user_name, user_from, user_index, mid, uid]
                max_id = 0
                htmlComment(data)
                getCommentLevel1(data, max_id)
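# Layout of the `data` list handed to the two comment fetchers below. Both of them
# read the post's mid and uid from the last two slots (data[-2] and data[-1]):
#   data = [weibo_content, user_name, user_from, user_index, mid, uid]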
def getCommentLevel1(data, max_id):
    """
    Fetch the first-level (top-level) comments of one post via the weibo.com Ajax API,
    paging with max_id until the API returns max_id == 0.
    :return:
    """
    mid = data[-2]
    uid = data[-1]
    url = "https://weibo.com/ajax/statuses/buildComments?"
    par = {
        'id': mid,
        'is_show_bulletin': '2',
        'is_mix': '0',
        'max_id': max_id,
        'count': '20',
        'uid': uid,
    }
    client = httpx.Client(http2=True, verify=False)  # http2=True requires the httpx[http2] extra
    response = client.get(url, params=par, headers=headers)
    jsondata = json.loads(response.text)
    max_id = jsondata['max_id']  # cursor for the next page; 0 means this was the last page
    content = jsondata['data']
    for ct in content:
        created_at = ct['created_at']  # raw comment timestamp
        struct_time = time.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')  # parse the timestamp
        time_array = time.strftime("%Y-%m-%d %H:%M:%S", struct_time)  # reformat as YYYY-MM-DD HH:MM:SS
        text = ct['text_raw']  # comment text
        screen_name = ct['user']['screen_name']  # commenter's screen name
        weibo_comment_data = data + [text, time_array, screen_name]
        saveCsv("微博信息_评论", weibo_comment_data)
    if max_id == 0:
        pass
    else:
        getCommentLevel1(data, max_id)
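# max_id acts as a paging cursor: pass 0 to get the first page of comments, then feed the
# max_id returned in each response back into the next call. The caller therefore starts
# with getCommentLevel1(data, 0), and the recursion above keeps requesting pages until the
# API hands back max_id == 0.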
def htmlComment(data):
    """
    Fetch the small comment widget that s.weibo.com embeds in the search results.
    The endpoint returns an HTML fragment, which is parsed with BeautifulSoup.
    """
    mid = data[-2]
    uid = data[-1]
    url = 'https://s.weibo.com/Ajax_Comment/small?'
    par = {
        'act': 'list',
        'mid': mid,
        'uid': uid,
        'smartFlag': 'false',
        'smartCardComment': '',
        'isMain': 'true',
        'pageid': 'weibo',
        '_t': '0',
    }
    client = httpx.Client(http2=True, verify=False)
    response = client.get(url, params=par, headers=headers)
    jsondata = json.loads(response.text)['data']['html']  # the payload is an HTML fragment
    html = BeautifulSoup(jsondata, 'lxml')
    comment_content = html.find_all('div', class_="content")
    for cc in comment_content:
        comment_info = str(cc.find('div', class_='txt').text).replace('\n', '').replace(' ', '').split(':')
        comment_text = comment_info[-1]  # text after the colon
        comment_user = comment_info[0]   # commenter's name before the colon
        comment_time = cc.find('p', class_="from").text
        weibo_comment_data = data + [comment_text, comment_time, comment_user]
        saveCsv("微博信息_评论", weibo_comment_data)
def runx():
    keytext = input('Please enter a search keyword: ')
    for x in range(1, 51):  # walk up to 50 pages of search results
        url = f"https://s.weibo.com/weibo?q={keytext}&page={x}"
        t = random.randint(2, 5)  # random 2-5 second pause between pages
        print(f"Fetching the next page in {t} seconds")
        time.sleep(t)
        getWeiboCommentinfo(url)
def saveCsv(filename, content):
    fp = open(f"{filename}.csv", 'a+', encoding='utf-8-sig', newline='')  # utf-8-sig so Excel detects the encoding
    csv_fp = csv.writer(fp)
    csv_fp.writerow(content)
    fp.close()
    print(f"Row written: {content}")
if __name__ == '__main__':
    runx()
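# Usage: run the script, paste the cookie copied from a logged-in weibo.com session when
# prompted, then enter a search keyword. Posts and their comments are appended row by row
# to 微博信息_评论.csv in the working directory.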