网站首页 > 文章专栏 > Python 采集接口数据
采集其他网页接口的数据:
"""Multi-threaded fetcher that downloads a range of detail-API URLs.

Every URL is queued up front; a fixed number of worker threads drain the
queue, appending successful response bodies to ``success.txt`` and failed
URLs to ``error.txt``.
"""
from queue import Empty, Queue
import threading
import time

import requests

# Seconds to wait for a connection / response before giving up on a URL.
# Without a timeout a single stalled connection would hang a worker forever.
REQUEST_TIMEOUT = 10

# ID range (start inclusive, stop exclusive) and worker count used by main().
FIRST_ID = 2
STOP_ID = 797056
WORKER_COUNT = 5

URL_TEMPLATE = 'https://www.test.cn/v55/model/detail?id=%d'

# NOTE(security): the cookie below contains session tokens copied from a
# browser. It is kept verbatim so behavior is unchanged, but it should be
# loaded from configuration / environment rather than stored in source.
DEFAULT_HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "cookie": "__guid=21700157.3803037399595994000.1551679927386.2166; _uab_collina=155376469911282939612083; _umdata=G6A95A67C9FB29FFB00378C58F506F76244C71B; Hm_lvt_932cd41aa09c90857d16a9dd6b041475=1551679966,1553764733; XSRF-TOKEN=eyJpdiI6IjgwVkpuRlFIbEp2UlhUUlJrUFY1dHc9PSIsInZhbHVlIjoibDhcL084UWZCTlFOOTlxOGpPTDFMSXBySWl4enE0Q1U0cWkrQWdmXC93MU12T21QN25YVHlJd0U0QWpTVDFhVlFjbnE1RGpGK1BwcTV1TUhrY0pod0JFUT09IiwibWFjIjoiYWNiYmU5N2FlZDY0ZmFiNDZhNjdlZjRmNDIwMmRlNTM5MmZiZjJkZTI0YTkyOWI0ZjJlMWFjNzY4MzlhOWIyYSJ9; om_session=eyJpdiI6IkVOa2FFd0g3MG1pM2pSdkhaNVhaK0E9PSIsInZhbHVlIjoiSnZQRmZDSHZCQ3ZiOEdCVTh2SUdGMWgwT1B5dXU3QXY1T1lSODRPa1k4NlpCVGpnaFZEeWNTM2hZR1U2RVhBVEZQVlF3OW45Sk5IclZ0TUkzR01KbWc9PSIsIm1hYyI6ImJiMjQyMTliMTg0YmExZGMzZjFiZTdjODdjNTYxOTQxZTgzODUxMGE5MTMwYTE1ZTRiYjhkYTY0MmE4MDlhMjEifQ%3D%3D; Hm_lvt_b971b31e4bb9cc9113595205c1048aff=1564457939; Hm_lpvt_b971b31e4bb9cc9113595205c1048aff=1564458101; monitor_count=1",
}

# Serialises appends to the result files so that records written by
# different worker threads cannot interleave.
_FILE_LOCK = threading.Lock()


def _append_line(path, line):
    """Append ``line`` plus a newline to ``path`` under the shared file lock.

    The file is opened as UTF-8 so that response text containing non-ASCII
    characters does not fail on platforms whose default encoding is narrower.
    ``newline=''`` keeps ``"\\n"`` from being translated on write.
    """
    with _FILE_LOCK:
        with open(path, 'a+', newline='', encoding='utf-8') as fp:
            fp.write(line + "\n")


class Producter(threading.Thread):
    """Worker thread that pulls URLs from a shared queue and fetches them.

    Args:
        page_queue: a ``queue.Queue`` of URL strings shared by all workers.
        *args, **kwargs: forwarded unchanged to ``threading.Thread``.

    Attributes:
        page_queue: the queue passed in.
        headers: HTTP headers sent with every request.
    """

    def __init__(self, page_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        # Per-instance copy so mutating one worker's headers does not
        # affect the others.
        self.headers = dict(DEFAULT_HEADERS)

    def run(self):
        """Fetch URLs until the queue is exhausted, then exit the thread."""
        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another worker can take the last item in between, leaving this
            # thread blocked forever. A non-blocking get is atomic.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                print('分页数据获取结束~')
                break
            self.parse_url(url)

    def parse_url(self, url):
        """Fetch ``url`` and record the outcome.

        On HTTP 200 the response text is appended to ``success.txt`` and
        printed. On any other status, or if the request itself fails
        (connection error, timeout, ...), the URL is appended to
        ``error.txt`` so it can be retried later.
        """
        try:
            res = requests.get(
                url, headers=self.headers, timeout=REQUEST_TIMEOUT
            )
        except requests.RequestException:
            # Treat transport failures like a bad status instead of letting
            # the exception kill the worker thread and lose the URL.
            res = None

        if res is not None and res.status_code == 200:
            text = res.text
            _append_line('success.txt', text)
            print(text)
            print('****' * 30)
        else:
            print('采集失败' + url)
            _append_line('error.txt', url)


def main(first_id=FIRST_ID, stop_id=STOP_ID, worker_count=WORKER_COUNT):
    """Queue the detail URLs for ``range(first_id, stop_id)`` and start workers.

    Args:
        first_id: first ID to fetch (inclusive).
        stop_id: ID at which to stop (exclusive).
        worker_count: number of ``Producter`` threads to start.

    The function returns once the workers have been started; the threads are
    not joined here.
    """
    # Unbounded queue: every URL is enqueued before any worker starts, so a
    # size limit could only cause put() to block with nobody consuming.
    page_queue = Queue()
    # The original loop carried a note saying only page 20 was tested, guarded
    # by a check that could never fire inside the range; the full range is
    # what was actually queued, and that is preserved.
    for i in range(first_id, stop_id):
        page_queue.put(URL_TEMPLATE % i)

    # Start the producer threads (5 by default).
    for _ in range(worker_count):
        # One-second pause before each start, kept from the original;
        # presumably to stagger the first requests -- TODO confirm.
        time.sleep(1)
        worker = Producter(page_queue)
        worker.start()


if __name__ == '__main__':
    main()
转载请注明出处