Python 多進程 (multiprocessing)

Python 多進程 (multiprocessing)

multiprocessing 筆記

Process

導入套件與進程 function

1
2
3
4
5
6
import time
import multiprocessing as mp

def claw(website):
    """Simulate a slow job for ``website`` and report when it finishes.

    Sleeps for ``len(website)`` seconds and then prints
    ``'<website>: hello world'``, so longer names take longer to finish.

    Args:
        website: Label for the simulated job; must support ``len()``.

    Returns:
        None. The only output is the printed line.
    """
    time.sleep(len(website))
    print(f'{website}: hello world')

執行多進程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Guard the entry point: under the "spawn" and "forkserver" start methods the
# child processes re-import this module, and unguarded start() calls would be
# executed again inside every child.
if __name__ == '__main__':
    # Create the processes.
    # ``args`` must be a tuple, so a single argument needs a trailing comma;
    # without it the string itself is unpacked character by character and the
    # call to the target fails.
    work_1 = mp.Process(target=claw, args=('websiteAAAAAA',))
    work_2 = mp.Process(target=claw, args=('websiteB',))

    # Start both processes.
    print('Start Process')
    work_1.start()
    work_2.start()

    # Block until both processes have finished.
    print('Wait...')
    work_1.join()
    work_2.join()
    print('Done!!!')

打印結果

1
2
3
4
5
Start Process
Wait...
websiteB: hello world
websiteAAAAAA: hello world
Done!!!

Pool 進程池

導入套件與進程 function

使用多進程爬 Google 新聞前五則

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import requests
import pandas as pd
from bs4 import BeautifulSoup
import multiprocessing as mp

def news_craw(topic):
    """Fetch the first five headlines of a Google News topic page.

    Args:
        topic: Google News topic id, inserted into the ``/topics/<topic>``
            URL path.

    Returns:
        A list of up to five dicts, each with a ``'title'`` (headline text)
        and a ``'link'`` (``https://news.google.com`` joined with the
        anchor's href). Fewer entries are returned when the page yields
        fewer matches.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    google_news = 'https://news.google.com'
    url = f'{google_news}/topics/{topic}?hl=zh-TW&gl=TW&ceid=TW%3Azh-Hant'

    # A timeout keeps a stalled connection from blocking a pool worker forever.
    res = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()
    # res.text is already a decoded str, which BeautifulSoup accepts directly.
    soup = BeautifulSoup(res.text, "html.parser")

    # Run each selector once instead of once per loop iteration.
    headlines = soup.select('div.xrnccd h3')
    anchors = soup.select('div.xrnccd h3 a')

    # zip over slices stops early instead of raising IndexError when the page
    # has fewer than five headlines.
    return [
        {
            'title': headline.text,
            # The first character of href is dropped before joining
            # (presumably a leading '.' of a relative path -- TODO confirm).
            'link': google_news + anchor['href'][1:],
        }
        for headline, anchor in zip(headlines[:5], anchors[:5])
    ]

新聞主題

1
2
3
4
5
6
# Google News topic ids, one per news category; each is passed to news_craw.
google_news_topic = [
    'CAAqJQgKIh9DQkFTRVFvSUwyMHZNRFptTXpJU0JYcG9MVlJYS0FBUAE',  # Taiwan
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx1YlY4U0JYcG9MVlJYR2dKVVZ5Z0FQAQ',  # International
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNRGx6TVdZU0JYcG9MVlJYR2dKVVZ5Z0FQAQ',  # Business
    'CAAqKggKIiRDQkFTRlFvSUwyMHZNREpxYW5RU0JYcG9MVlJYR2dKVVZ5Z0FQAQ',  # Entertainment
]

執行多進程

pool.map

1
2
3
4
5
6
7
# ``processes`` defaults to the machine's CPU count when omitted.
# NOTE: kept at top level so the trailing ``df`` expression is still displayed
# in a notebook; when run as a script under the "spawn" start method, move
# this under ``if __name__ == '__main__':``.
with mp.Pool(processes=4) as pool:
    # map() blocks until every topic has been processed and returns the
    # per-topic results in input order.
    res = pool.map(news_craw, google_news_topic)

# Flatten the per-topic lists into a single list of records.
new_list = [item for news in res for item in news]
df = pd.DataFrame(new_list)
df

pool.apply_async

1
2
3
4
5
6
7
8
9
10
11
12
13
# NOTE: kept at top level so the trailing ``df`` expression is still displayed
# in a notebook; when run as a script under the "spawn" start method, move
# this under ``if __name__ == '__main__':``.
# The with-block guarantees the pool is torn down even if submitting a task
# raises part-way through.
with mp.Pool(processes=4) as pool:
    # apply_async() returns immediately with one AsyncResult per topic.
    multiple_res = [
        pool.apply_async(news_craw, (topic,)) for topic in google_news_topic
    ]

    pool.close()  # no more tasks will be submitted
    pool.join()  # wait for the worker processes to finish

# Use get() to collect each task's return value.
new_list = [item for res in multiple_res for item in res.get()]
df = pd.DataFrame(new_list)
df

打印結果

在使用上兩者結果是一樣的

評論