Multiprocessing
import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'
# Download a book cover image (helper; not called in the script below)
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()

# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))
# Fetch a page through a proxy, retrying up to 5 times
# (helper; the script below renders pages with Selenium instead and never calls it)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # After 5 failed attempts, remove the proxy from the pool
    delete_proxy(proxy)
    return None
# Append each row in lst to the CSV file
def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)
def get_info(url):
    print(url)
    # Optional ChromeOptions: custom User-Agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list container is present
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # browser.quit()
        browser.close()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect one row per book
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL, cut out of the <img> tag's src attribute
        bookPicL = str(each.find("img"))
        startL = bookPicL.find('src="') + len('src="')
        endL = startL + bookPicL[startL:].find('"')
        bookPicUrl = bookPicL[startL:endL]
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags, with the "本书标签:" ("book tags:") prefix and whitespace stripped
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "")
                     .replace(" ", "").replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional politeness delay between pages
    # time.sleep(1)
    return data_list
def run(url):
    save_to_csv(csvPath, get_info(url))
# Program entry point
if __name__ == '__main__':
    # Write the CSV header once, then let the pool workers append rows
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    pool = Pool(4)
    # Full crawl:
    # pool.map(run, ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(1, 11335)])
    pool.map(run, [
        'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i))
        for i in range(1, 501)])
    pool.close()
    pool.join()
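Note that multiprocessing.dummy.Pool only mimics the multiprocessing API: its workers are threads, so the script above is effectively a 4-thread pool. A minimal sketch of the same fan-out with real processes, assuming the run function and URL pattern from the script above (the function passed to the pool must stay a picklable, module-level object):

from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(4) as pool:  # four worker processes instead of four threads
        pool.map(run, [
            'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            for i in range(1, 501)])

Either way, each get_info() call launches its own Chrome instance, so the pool size is the practical limit on how many browsers run at once.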
Multithreading
import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from threading import Thread
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'
# Download a book cover image (helper; not called in the script below)
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)
# Fetch a proxy from the proxy pool
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()

# Remove a proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))
# Fetch a page through a proxy, retrying up to 5 times
# (helper; the script below renders pages with Selenium instead and never calls it)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # After 5 failed attempts, remove the proxy from the pool
    delete_proxy(proxy)
    return None
# Append each row in lst to the CSV file
def save_to_csv(csvPath, lst):
    with open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)
def get_info(url):
    print(url)
    # Optional ChromeOptions: custom User-Agent, proxy, and hiding the automation flag
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait until the book list container is present
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        html = browser.page_source
    finally:
        # browser.quit()
        browser.close()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    data_list = []
    # Collect one row per book
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL, cut out of the <img> tag's src attribute
        bookPicL = str(each.find("img"))
        startL = bookPicL.find('src="') + len('src="')
        endL = startL + bookPicL[startL:].find('"')
        bookPicUrl = bookPicL[startL:endL]
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags, with the "本书标签:" ("book tags:") prefix and whitespace stripped
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "")
                     .replace(" ", "").replace("\r", "").replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optional politeness delay between pages
    # time.sleep(1)
    return data_list
def run(urls):
    for url in urls:
        save_to_csv(csvPath, get_info(url))
# Program entry point
if __name__ == '__main__':
    # Write the CSV header once, then let the threads append rows
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    # Give each thread its own slice of pages so no page is crawled twice
    url_pattern = 'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'
    urls1 = [url_pattern.format(i) for i in range(1, 3)]
    urls2 = [url_pattern.format(i) for i in range(3, 5)]
    urls3 = [url_pattern.format(i) for i in range(5, 7)]
    urls4 = [url_pattern.format(i) for i in range(7, 9)]
    # Start 4 threads, passing each one its own list of pages to crawl
    # (target=run, args=(urls,) hands the function to the thread instead of calling it in the main thread)
    thread_list = []
    t1 = Thread(target=run, args=(urls1,))
    t1.start()
    t2 = Thread(target=run, args=(urls2,))
    t2.start()
    t3 = Thread(target=run, args=(urls3,))
    t3.start()
    t4 = Thread(target=run, args=(urls4,))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    # Wait for all threads to finish
    for t in thread_list:
        t.join()
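The same four workers can also be managed with concurrent.futures.ThreadPoolExecutor, which takes care of starting and joining the threads. A minimal sketch, assuming the get_info, save_to_csv, and csvPath definitions from the script above; run_one and csv_lock are illustrative names, and the lock is there because several threads appending to one CSV file at the same time can interleave rows:

from concurrent.futures import ThreadPoolExecutor
from threading import Lock

csv_lock = Lock()  # serialize concurrent appends to the shared CSV file

def run_one(url):
    rows = get_info(url)
    with csv_lock:  # only one thread writes to the file at a time
        save_to_csv(csvPath, rows)

if __name__ == '__main__':
    pages = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
             for i in range(1, 9)]
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(run_one, pages)  # the with-block waits for all pages to finish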
Random UA & Proxy Pool
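The original post does not show this part, so the code below is not the author's; it is only a minimal sketch of how the commented-out ChromeOptions lines and the get_proxy() helper from the scripts above are typically combined. The USER_AGENTS list and the make_browser name are assumptions:

import random
from selenium import webdriver

# Hypothetical User-Agent pool; any real browser UA strings can go here
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
]

def make_browser():
    # Each new Chrome instance gets a random UA and a proxy drawn from the pool
    options = webdriver.ChromeOptions()
    options.add_argument('--user-agent=%s' % random.choice(USER_AGENTS))
    options.add_argument('--proxy-server=http://%s' % get_proxy().get("proxy"))
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return webdriver.Chrome(options=options)

With such a helper, get_info() would call make_browser() instead of webdriver.Chrome().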