requests + bs4 模块实现简单爬虫

  • 作者:ykk
  • 分类: 随笔
  • 发表时间:2020-05-30 21:13:03
  • 阅读量:(526)

爬取省医药平台公布的数据


import os
import pandas as pd

# Accumulators shared by the scraping helpers below.
# NOTE(review): `list` shadows the builtin of the same name — kept for
# compatibility with the rest of the script, but renaming it (e.g. `rows`)
# would be safer. The original `global list` statement was a no-op at module
# level and has been removed.
list = []   # flattened <td> cell texts collected across all pages
list1 = []  # table header (<th>) texts, collected once

# URL of the listing page to scrape (provincial medical procurement platform)
url = 'http://yycg.hnsggzy.com/NoticeBoard/ShowHCGoodsInfoTen.aspx?Group=1&ShowCom=true&ShowProduct=true&ShowReg=true?Id=70290'


def wrexcel(pp, name):
    """Write the given records to an .xlsx file in the current directory.

    Args:
        pp: records accepted by pandas.DataFrame.from_records
            (list of tuples/dicts, or an existing DataFrame).
        name: base file name, without the .xlsx extension.

    Returns:
        1 on completion (legacy success flag).
    """
    # os.path.join is portable; the original concatenated a literal
    # Windows backslash, which breaks on non-Windows systems.
    filepath = os.path.join(os.getcwd(), '%s.xlsx' % name)
    print(filepath)
    dret = pd.DataFrame.from_records(pp)  # normalize records into a DataFrame
    dret.to_excel(filepath)  # index=True by default: row labels are written
    print('写入数据ing。。。。')
    return 1


def btlist(target_url=None):
    """Fetch the listing page and collect its table header texts into list1.

    Args:
        target_url: page to scrape; defaults to the module-level `url`
            (new optional parameter — existing callers are unaffected).

    Side effects:
        Appends each <th> text (with spaces stripped) of every element
        with class 'list_tab' to the global `list1`, then prints list1.
    """
    import requests
    from bs4 import BeautifulSoup
    if target_url is None:
        target_url = url
    # timeout so a dead/slow server cannot hang the script forever
    req = requests.get(target_url, timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')
    for table in soup.find_all(class_='list_tab'):
        for th in table.find_all('th'):
            list1.append(th.text.replace(" ", ""))
    print(list1)

def addlist(s, p):
    """Post an ASP.NET pager request for page `s` and collect row cells.

    Args:
        s: page number, sent as the __EVENTARGUMENT postback value.
        p: __VIEWSTATEGENERATOR token copied from the page's form.

    Side effects:
        Appends each <td> text (with spaces stripped) of every element
        with class 'list_tab' on the returned page to the global `list`.
    """
    import requests
    from bs4 import BeautifulSoup

    # ASP.NET postback fields that drive the 'pager1' paging control
    data = {
        "__VIEWSTATEGENERATOR": p,
        '__EVENTTARGET': 'pager1',
        '__EVENTARGUMENT': s}
    # timeout so one stuck request cannot hang the whole crawl
    req = requests.post(url, data, timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')
    for table in soup.find_all(class_='list_tab'):
        for td in table.find_all('td'):
            list.append(td.text.replace(" ", ""))


btlist()


# range() excludes the stop value: for e.g. 63 pages the stop must be 64.
# NOTE(review): the original also said "8 pages total" while looping to 690 —
# these contradict each other; confirm the real page count before running.
for g in range(1, 690):
    print(g)
    addlist(g, '104C460F')  # '104C460F' is the page's __VIEWSTATEGENERATOR token; loop fetches every page
    pass
n = int(len(list1))  # number of columns = number of header cells
c = [list[i:i + n] for i in range(0, len(list), n)]  # chunk flat cell list into rows of n columns
df = pd.DataFrame(c, columns=list1)
wrexcel(df, 'tmp20200515')  # output file name (without extension)

 

 

上一篇: itchat实现微信群聊自动签到

下一篇: fastapi+sqlserver实现简单接口

评论 列表: