Sharing Python Code for Scraping Weibo Data

# encoding: utf-8
# Python: 3.6.8
# Editor: PyCharm

import requests
from bs4 import BeautifulSoup
import csv
import time
import random



def get_content(url):
    # url = "https://s.weibo.com/weibo?q=python&nodup=1"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SINAGLOBAL=7057541101792.921.1637494526703; SUB=_2A25Mnl94DeRhGeNK4lMW9i7OyD2IHXVsYWEwrDV8PUJbkNAKLVmtkW1NSSlXpFc4PFxLatA-5y5QPIxe1xa3ZpXo; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5WpUycXw1BkhZAbewN8QBu5NHD95QfSh.pS0q7eoepWs4Dqcj_i--ciKLhiKn7i--ci-zRi-20i--NiKLWiKnXi--4i-iFiK.pi--fi-88i-2E; _s_tentry=weibo.com; Apache=826961126770.6844.1637671038964; ULV=1637671038985:2:2:2:826961126770.6844.1637671038964:1637494526810; UOR=,,www.gooseeker.com',
        'Host': 's.weibo.com',
        'Referer': 'https://s.weibo.com/weibo?q=python&page=2',
        'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    html = BeautifulSoup(response.content, 'lxml')
    cards = html.find_all('div', class_="card-wrap")  # note: class_ needs the trailing underscore
    for ct in cards:
        user_info = ct.find_all('a', class_="name")
        if user_info:
            user_name = user_info[0].text  # poster's display name
            user_index = "https:" + user_info[0]['href']  # poster's homepage
            # post time and client device; note this strips ALL spaces, including those inside the text
            user_from = str(ct.find('p', class_="from").text).replace(' ', '').replace('\n', '')
            # post content, with spaces and newlines stripped
            weibo_content = str(ct.find('p', class_="txt").text).replace(' ', '').replace('\n', '')
            data = [weibo_content, user_name, user_from, user_index]
            saveCsv('weibo_content', data)
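
The Cookie in the headers above is tied to a specific logged-in session and will expire; before running this yourself, replace it with one copied from your own browser. A minimal sketch of one way to keep the cookie out of the source file, assuming you export it in a WEIBO_COOKIE environment variable (a name introduced here for illustration, not part of the original code):

import os

# Hypothetical helper: read the Weibo cookie from the environment instead of
# hard-coding it, so the session value never ends up in version control.
def load_cookie():
    cookie = os.environ.get("WEIBO_COOKIE")
    if not cookie:
        raise RuntimeError("Set WEIBO_COOKIE to the Cookie header copied from your browser")
    return cookie

# Inside get_content you would then build the header as:
#   headers['Cookie'] = load_cookie()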

def runx():

    for x in range(1, 51):

        print(f"Scraping page {x}")

        url = f"https://s.weibo.com/weibo?q=python&nodup=1&page={x}"
        t = random.randint(2, 5)  # random pause of 2-5 seconds between pages
        print(f"Starting in {t} seconds")
        time.sleep(t)

        if x % 5 == 0:
            t = random.randint(5, 10)  # every 5 pages, take a longer 5-10 second break
            print(f"Resuming in {t} seconds")
            time.sleep(t)  # sleep for the randomly chosen duration

        get_content(url)
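
The loop above assumes every request succeeds. If Weibo rate-limits you, requests.get still returns a response (often a redirect to a login page), so a status check plus a simple retry can make long runs more robust. A sketch of one approach, not part of the original post:

def get_with_retry(url, headers, retries=3):
    # Retry a few times with a growing pause; raise_for_status() turns
    # HTTP errors (e.g. 403 rate limits) into exceptions we can catch.
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Attempt {attempt} failed: {e}")
            time.sleep(5 * attempt)
    raise RuntimeError(f"Giving up on {url} after {retries} attempts")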


def saveCsv(filename, content):
    # open in append mode with utf-8-sig so Excel reads the BOM correctly
    with open(f"{filename}.csv", 'a+', encoding='utf-8-sig', newline='') as fp:
        csv_fp = csv.writer(fp)
        csv_fp.writerow(content)
    print(f"Wrote row: {content}")





if __name__ == '__main__':

    col = ['post content', 'poster name', 'post time and device', 'poster homepage']
    saveCsv('weibo_content', col)
    runx()
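
The search keyword ("python") is hard-coded into the URL inside runx. If you want to scrape other topics, one illustrative tweak (a hypothetical variant, not part of the original code) is to pass the keyword in as a parameter and URL-encode it:

from urllib.parse import quote

# Hypothetical variant of runx that takes the search keyword as an argument.
# Usage: runx_keyword("区块链")
def runx_keyword(keyword):
    for x in range(1, 51):
        url = f"https://s.weibo.com/weibo?q={quote(keyword)}&nodup=1&page={x}"
        time.sleep(random.randint(2, 5))  # same polite pause as runx
        get_content(url)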

 

If you enjoyed this post, please bookmark it!

Original content from 面具网, by liuze, published 2022-01-29.

Reposting: unless otherwise stated, articles on this site are released under the CC-4.0 license; please credit the source when reposting.