网站首页 文章专栏 python采集接口数据

Python 采集接口数据

编辑时间:2019-10-14 17:15:20 作者:苹果 浏览量:33







下面的示例使用多线程配合 requests 批量采集其他网站接口的数据:请求成功的响应内容追加写入 success.txt,请求失败的 URL 追加写入 error.txt。

    from queue import Queue    
    import requests
    import threading
    import time
    
    class Producter(threading.Thread):
        def __init__(self, page_queue, *args, **kwargs):
            super(Producter, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
                "cookie": "__guid=21700157.3803037399595994000.1551679927386.2166; _uab_collina=155376469911282939612083; _umdata=G6A95A67C9FB29FFB00378C58F506F76244C71B; Hm_lvt_932cd41aa09c90857d16a9dd6b041475=1551679966,1553764733; XSRF-TOKEN=eyJpdiI6IjgwVkpuRlFIbEp2UlhUUlJrUFY1dHc9PSIsInZhbHVlIjoibDhcL084UWZCTlFOOTlxOGpPTDFMSXBySWl4enE0Q1U0cWkrQWdmXC93MU12T21QN25YVHlJd0U0QWpTVDFhVlFjbnE1RGpGK1BwcTV1TUhrY0pod0JFUT09IiwibWFjIjoiYWNiYmU5N2FlZDY0ZmFiNDZhNjdlZjRmNDIwMmRlNTM5MmZiZjJkZTI0YTkyOWI0ZjJlMWFjNzY4MzlhOWIyYSJ9; om_session=eyJpdiI6IkVOa2FFd0g3MG1pM2pSdkhaNVhaK0E9PSIsInZhbHVlIjoiSnZQRmZDSHZCQ3ZiOEdCVTh2SUdGMWgwT1B5dXU3QXY1T1lSODRPa1k4NlpCVGpnaFZEeWNTM2hZR1U2RVhBVEZQVlF3OW45Sk5IclZ0TUkzR01KbWc9PSIsIm1hYyI6ImJiMjQyMTliMTg0YmExZGMzZjFiZTdjODdjNTYxOTQxZTgzODUxMGE5MTMwYTE1ZTRiYjhkYTY0MmE4MDlhMjEifQ%3D%3D; Hm_lvt_b971b31e4bb9cc9113595205c1048aff=1564457939; Hm_lpvt_b971b31e4bb9cc9113595205c1048aff=1564458101; monitor_count=1",
            }
    
        def run(self):
            while True:
                if self.page_queue.empty():
                    print('分页数据获取结束~')
                    break
                self.parse_url(self.page_queue.get())
    
        def parse_url(self, url):
            res = requests.get(url, headers=self.headers)
            if res.status_code == 200:
                text = res.text
                with open('success.txt', 'a+', newline='') as fp:
                    fp.write(text+"\n")
                print(text)
                print('****' * 30)
            else:
                print('采集失败' + url)
                with open('error.txt', 'a+', newline='') as fp:
                    fp.write(url + "\n")
    
    def main():
        page_queue = Queue(797056)
    
        for i in range(2, 797056):
            # 功能只测试第20页
            if i > 797056:
                break
            url = 'https://www.test.cn/v55/model/detail?id=%d' % i
            page_queue.put(url)
    
        # 开启5个线程执行生产者
        for i in range(5):
            time.sleep(1)
            t1 = Producter(page_queue)
            t1.start()
    
    if __name__ == '__main__':
        main()



    出自:何冰华个人网站

    地址:http://www.hebinghua.com/

    转载请注明出处


来说两句吧
最新评论