Python 3.2带来了concurrent.futures模块,这个模块包含了线程池和进程池、管理并行编程任务、处理非确定性的执行流程、进程/线程同步等功能。关于这部分的内容推荐大家阅读《Python并行编程》。
协程(coroutine)通常又称为微线程或纤程,它是相互协作的一组子程序(函数)。所谓相互协作指的是在执行函数A时,可以随时中断去执行函数B,之后函数B也可以中断,再回来继续执行函数A。注意,这一过程并不是函数调用(因为没有调用语句),整个过程看起来像多线程,然而协程始终只在一个线程中执行。协程通过yield关键字和send()操作来转移执行权,协程之间不是调用者与被调用者的关系。
- Python 2.2:第一次提出了生成器(最初称之为迭代器)的概念(PEP 255)。
- Python 2.5:引入了将对象发送回暂停了的生成器这一特性即生成器的send()方法(PEP 342)。
- Python 3.3:添加了yield from特性,允许从迭代器中返回任何值(注意生成器本身也是迭代器),这样我们就可以串联生成器并且重构出更好的生成器。
- Python 3.4:引入asyncio.coroutine装饰器用来标记作为协程的函数,协程函数和asyncio及其事件循环一起使用,来实现异步I/O操作。
- Python 3.5:引入了async和await,可以使用async def来定义一个协程函数。在Python 3.5中,这个函数里不能包含任何形式的yield语句(从Python 3.6开始,在async def中使用yield会定义一个异步生成器);协程函数可以使用return返回值,并通过await等待其他协程(可等待对象)并获得其结果。
协程实现了协作式并发,通过提高CPU的利用率来达到改善性能的目的。著名的第三方库aiohttp就是通过协程的方式实现了HTTP客户端和HTTP服务器的功能,较之requests有更好的获取数据的性能,有兴趣可以阅读它的官方文档。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import asyncio

import aiohttp


async def download(url):
    """Fetch *url* and print its HTTP status, response headers and body text.

    A new ``aiohttp.ClientSession`` is opened for each call and closed when
    the download finishes.

    Args:
        url: Absolute URL of the page to download.
    """
    print('Fetch:', url)
    async with aiohttp.ClientSession() as session:
        # NOTE(review): ssl=False turns off certificate verification; this is
        # acceptable for a demo but should be confirmed before real use.
        async with session.get(url, ssl=False) as resp:
            print(url, '--->', resp.status)
            print(url, '--->', resp.headers)
            print('\n\n', await resp.text())


async def _download_all(urls):
    """Download every URL in *urls* concurrently and report failures.

    ``return_exceptions=True`` keeps one failing download from cancelling
    the others; each failure is printed next to the URL that caused it.
    """
    results = await asyncio.gather(
        *(download(url) for url in urls),
        return_exceptions=True,
    )
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print(url, '--->', repr(result))


def main():
    """Download a fixed list of home pages concurrently on one event loop."""
    urls = [
        'https://www.baidu.com',
        'http://www.sohu.com/',
        'http://www.sina.com.cn/',
        'https://www.taobao.com/',
        'http://jd.com/',
    ]
    # asyncio.wait() no longer accepts bare coroutine objects (TypeError on
    # Python 3.11+), and get_event_loop() outside a running loop is
    # deprecated. asyncio.run() creates the loop, runs the coroutine and
    # always closes the loop, even when an exception escapes.
    asyncio.run(_download_all(urls))


if __name__ == '__main__':
    main()
实例 – 多线程爬取“手机搜狐网”所有页面
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import pickle
import zlib
from enum import Enum, unique
from functools import wraps
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary

# NOTE(review): connection settings are hard-coded (and the hosts are blank
# in this copy of the script). A password committed to source control should
# be moved to configuration or the environment before real use.
REDIS_HOST = ''
REDIS_PORT = 6379
REDIS_PASSWORD = '1qaz2wsx'
MONGO_HOST = ''
MONGO_PORT = 27017

# Redis keys shared by every worker thread.
TASK_QUEUE_KEY = 'm_sohu_task'
VISITED_SET_KEY = 'visited_urls'

# How long an idle worker waits before polling the task queue again.
IDLE_POLL_SECS = 0.5


@unique
class SpiderStatus(Enum):
    """Whether a spider is currently processing a URL."""

    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    """Decode *page_bytes* with the first charset in *charsets* that works.

    Args:
        page_bytes: Raw response body.
        charsets: Candidate encodings, tried in order.

    Returns:
        The decoded text, or ``None`` when no charset can decode the bytes.
    """
    for charset in charsets:
        try:
            return page_bytes.decode(charset)
        except UnicodeDecodeError:
            continue
    return None


class Retry:
    """Decorator that re-runs a callable when it raises one of *errors*.

    Args:
        retry_times: Maximum number of attempts.
        wait_secs: Base delay; the actual pause between attempts is a random
            value in ``[wait_secs, 2 * wait_secs)``.
        errors: Exception types that trigger another attempt.

    The wrapped callable returns ``None`` when every attempt fails.
    """

    def __init__(self, *, retry_times=3, wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):

        @wraps(fn)  # keep the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            for attempt in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    # Sleeping after the final attempt would only delay the
                    # caller; nothing is retried afterwards.
                    if attempt < self.retry_times - 1:
                        sleep((random() + 1) * self.wait_secs)
            return None

        return wrapper


class Spider:
    """Fetches pages and pushes the same-domain links it finds to Redis."""

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None, timeout=10):
        """Download *current_url* and return its decoded text.

        Args:
            current_url: URL to request.
            charsets: Encodings tried, in order, when decoding the body.
            user_agent: Optional ``user-agent`` header value.
            proxies: Optional proxies mapping passed to ``requests.get``.
            timeout: Seconds to wait for the server. Without a timeout a
                stalled connection would block the worker thread forever.

        Returns:
            The page text, or ``None`` when the status code is not 200, the
            body cannot be decoded, or every retry attempt raised.
        """
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url, headers=headers, proxies=proxies,
                            timeout=timeout)
        if resp.status_code != 200:
            return None
        return decode_page(resp.content, charsets)

    def parse(self, html_page, *, domain='m.sohu.com'):
        """Queue every unvisited link on *html_page* that points at *domain*.

        Links are pushed to the Redis task list through the calling thread's
        ``thread_local.redis_client``.

        Args:
            html_page: HTML text of the page.
            domain: Host a link must resolve to in order to be queued.
        """
        soup = BeautifulSoup(html_page, 'lxml')
        if soup.body is None:
            # A document without <body> has no links to follow.
            return
        redis_client = thread_local.redis_client
        for a_tag in soup.body.select('a[href]'):
            try:
                parser = urlparse(a_tag.attrs['href'])
            except ValueError:
                # A malformed href must not abort the rest of the page.
                continue
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            # Only real web links are crawlable; javascript:, mailto:, tel:
            # and similar schemes would otherwise be queued as bogus URLs.
            if scheme not in ('http', 'https') or netloc != domain:
                continue
            path = parser.path
            if not path.startswith('/'):
                # The page's own URL is not available here, so relative
                # paths are anchored at the site root; this also keeps the
                # path from being glued onto the host name.
                path = '/' + path
            query = '?' + parser.query if parser.query else ''
            full_url = f'{scheme}://{netloc}{path}{query}'
            if not redis_client.sismember(VISITED_SET_KEY, full_url):
                redis_client.rpush(TASK_QUEUE_KEY, full_url)

    def extract(self, html_page):
        """Placeholder for pulling structured data out of *html_page*."""
        pass

    def store(self, data_dict):
        """Placeholder for persisting *data_dict*.

        The per-thread clients are available as ``thread_local.redis_client``
        and ``thread_local.mongo_db``.
        """
        pass


class SpiderThread(Thread):
    """Daemon worker that pops URLs from Redis and crawls them forever."""

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        """Open this thread's connections and process the task queue."""
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                                   password=REDIS_PASSWORD)
        mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop(TASK_QUEUE_KEY)
            if not current_url:
                # Pause instead of hammering Redis in a tight loop.
                sleep(IDLE_POLL_SECS)
                continue
            self.spider.status = SpiderStatus.WORKING
            try:
                self._process_url(redis_client, current_url.decode('utf-8'))
            except Exception as e:
                # One bad page must not kill the worker.
                print(f'[{self.name}]: {e!r}')
            finally:
                # Always return to IDLE; a thread stuck in WORKING would
                # keep main() waiting forever.
                self.spider.status = SpiderStatus.IDLE

    def _process_url(self, redis_client, current_url):
        """Fetch, store and parse *current_url* unless it was already seen."""
        # SADD reports how many members were newly added, so it marks the
        # URL as visited and tells us whether this thread claimed it in one
        # atomic step. A separate SISMEMBER check would let two threads
        # fetch the same URL.
        if not redis_client.sadd(VISITED_SET_KEY, current_url):
            return
        html_page = self.spider.fetch(current_url)
        if not html_page:
            return
        hasher = hasher_proto.copy()
        hasher.update(current_url.encode('utf-8'))
        doc_id = hasher.hexdigest()
        sohu_data_coll = thread_local.mongo_db.webpages
        if not sohu_data_coll.find_one({'_id': doc_id}):
            sohu_data_coll.insert_one({
                '_id': doc_id,
                'url': current_url,
                # The page is stored pickled and zlib-compressed; readers
                # must only unpickle data written by this crawler.
                'page': Binary(zlib.compress(pickle.dumps(html_page))),
            })
        self.spider.parse(html_page)


def is_any_alive(spider_threads):
    """Return True when at least one spider is in the WORKING state."""
    return any(spider_thread.spider.status == SpiderStatus.WORKING
               for spider_thread in spider_threads)


thread_local = local()
hasher_proto = sha1()


def main():
    """Seed the task queue, start ten workers and wait until all is idle."""
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT,
                               password=REDIS_PASSWORD)
    if not redis_client.exists(TASK_QUEUE_KEY):
        redis_client.rpush(TASK_QUEUE_KEY, 'http://m.sohu.com/')

    spider_threads = [SpiderThread(f'thread-{i}', Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()

    # The workers are daemon threads, so returning from main() ends them.
    while redis_client.exists(TASK_QUEUE_KEY) or is_any_alive(spider_threads):
        sleep(5)

    print('Over!')


if __name__ == '__main__':
    main()
本文来自这个系列长期转载Python-100-Days ,本文观点不代表蓝洛水深立场,转载请联系原作者。