Scraping Yousuu book information with Python, Selenium and multiple processes, writing to CSV in real time

Multiprocessing

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # NOTE: multiprocessing.dummy provides a *thread* pool behind the multiprocessing API
from threading import Lock
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'


# Download a book cover image
def save_image_file(url, path):
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(path, 'wb') as f:
            f.write(resp.content)


# Fetch a proxy from the proxy pool service
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a failing proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Fetch a page through a proxy from the pool (requests-based helper; not used by the Selenium flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # Failed 5 times: drop this proxy from the pool
    delete_proxy(proxy)
    return None


# Append rows to the shared CSV; the lock keeps concurrent workers from interleaving rows
write_lock = Lock()


def save_to_csv(csvPath, lst):
    with write_lock, open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional: launch Chrome with custom options (random UA, proxy, hidden automation banner)
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait (up to 20 s) for the book-list container to appear
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        # print(element)
        html = browser.page_source
    finally:
        # quit() also terminates the chromedriver process; close() alone can leave it running
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    # print(soup.find_all("script"))
    data_list = []
    # Collect each book's fields into a list
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL from the <img> tag's src attribute
        img = each.find("img")
        bookPicUrl = img.get("src", "") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags: drop the "本书标签:" prefix and all whitespace
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "")
                     .replace(" ", "")
                     .replace("\r", "")
                     .replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optionally sleep between pages to throttle requests (needs `import time`)
    # time.sleep(1)
    return data_list


def run(url):
    save_to_csv(csvPath, get_info(url))


# Main entry point
if __name__ == '__main__':
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    pool = Pool(4)
    # pool.map(run,['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i))for i in range(1, 11335)])
    pool.map(run, [
        'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for
        i in range(1, 501)])
    pool.close()
    pool.join()
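
As the import comment above notes, multiprocessing.dummy.Pool is really a thread pool that only mimics the multiprocessing API, so the listing above runs on threads despite the heading. For genuinely multi-process crawling, something along the following lines should work; this is only a minimal sketch that reuses get_info and csvPath from the listing above and writes every row from the parent process as results arrive, which also sidesteps concurrent writes to the CSV.

from multiprocessing import Pool
import csv

if __name__ == '__main__':
    urls = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            for i in range(1, 501)]
    with Pool(4) as pool, open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        # imap_unordered yields each page's rows as soon as a worker process finishes it
        for rows in pool.imap_unordered(get_info, urls):
            writer.writerows(rows)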

Multithreading

import csv
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from threading import Thread, Lock
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

csvPath = 'C:/tool/dev/python/PycharmProjects/day0617/yousuu/data/yousuu.csv'


# Download a book cover image
def save_image_file(url, path):
    resp = requests.get(url)
    if resp.status_code == 200:
        with open(path, 'wb') as f:
            f.write(resp.content)


# Fetch a proxy from the proxy pool service
def get_proxy():
    return requests.get("http://118.24.52.95/get/").json()


# Remove a failing proxy from the proxy pool
def delete_proxy(proxy):
    requests.get("http://118.24.52.95/delete/?proxy={}".format(proxy))


# Fetch a page through a proxy from the pool (requests-based helper; not used by the Selenium flow below)
def getHtml(url):
    retry_count = 5
    proxy = get_proxy().get("proxy")
    while retry_count > 0:
        try:
            # Request the page through the proxy
            html = requests.get(url, proxies={"http": "http://{}".format(proxy)})
            return html
        except Exception:
            retry_count -= 1
    # Failed 5 times: drop this proxy from the pool
    delete_proxy(proxy)
    return None


# Append rows to the shared CSV; the lock keeps the worker threads from interleaving rows
write_lock = Lock()


def save_to_csv(csvPath, lst):
    with write_lock, open(csvPath, 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for info_list in lst:
            writer.writerow(info_list)


def get_info(url):
    print(url)
    # Optional: launch Chrome with custom options (random UA, proxy, hidden automation banner)
    # options = webdriver.ChromeOptions()
    # options.add_argument('--user-agent=%s' % ua)
    # options.add_argument('--proxy-server=http://%s' % get_proxy())
    # options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # browser = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
    browser = webdriver.Chrome()
    browser.get(url)
    # Explicit wait (up to 20 s) for the book-list container to appear
    try:
        element = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div/div/div/div[2]'))
        )
        # print(element)
        html = browser.page_source
    finally:
        # quit() also terminates the chromedriver process; close() alone can leave it running
        browser.quit()
    print(url + " done")
    soup = BeautifulSoup(html, "html.parser")
    div_list = soup.find_all('div', class_='list-card-layout full-mode-book')
    # print(soup.find_all("script"))
    data_list = []
    # Collect each book's fields into a list
    for each in div_list:
        bookInfo = each.find("div", class_="list-card-content")
        # Cover image URL from the <img> tag's src attribute
        img = each.find("img")
        bookPicUrl = img.get("src", "") if img else ""
        # Title
        bookName = bookInfo.div.a.text
        # Author
        author = bookInfo.find("p").find('a', class_='author-name ellipsis').text
        # Tags: drop the "本书标签:" prefix and all whitespace
        bookLabel = (bookInfo.find("p", class_="bookinfo-tags").text
                     .replace("本书标签:", "")
                     .replace(" ", "")
                     .replace("\r", "")
                     .replace("\n", ""))
        # Word count
        wordCount = bookInfo.p.span.text
        info_list = [bookName, author, bookLabel, wordCount, bookPicUrl]
        data_list.append(info_list)
    # Optionally sleep between pages to throttle requests (needs `import time`)
    # time.sleep(1)
    return data_list


def run(urls):
    for url in urls:
        save_to_csv(csvPath, get_info(url))



# Main entry point
if __name__ == '__main__':
    with open(csvPath, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(["bookName", "author", "bookLabel", "wordCount", "bookPicUrl"])
    urls1 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(1, 3)]
    urls2 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(3, 5)]
    urls3 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(5, 7)]
    urls4 = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(str(i)) for i in range(7, 9)]
    # Start 4 threads, each given its own slice of pages to crawl
    # (pass the function and its arguments separately; target=run(urls) would run it in the main thread)
    thread_list = []
    t1 = Thread(target=run, args=(urls1,))
    t1.start()
    t2 = Thread(target=run, args=(urls2,))
    t2.start()
    t3 = Thread(target=run, args=(urls3,))
    t3.start()
    t4 = Thread(target=run, args=(urls4,))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    for t in thread_list:
        t.join()
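
For comparison, the same four-way fan-out can be written more compactly with the standard library's ThreadPoolExecutor, which hands pages to idle threads from one shared list instead of slicing the URLs by hand. This is only a sketch: it assumes get_info, save_to_csv and csvPath from the listing above, the page range is arbitrary, and crawl_page is an illustrative helper rather than part of the original code.

from concurrent.futures import ThreadPoolExecutor


def crawl_page(url):
    save_to_csv(csvPath, get_info(url))


if __name__ == '__main__':
    urls = ['https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            for i in range(1, 9)]
    # Four worker threads pull URLs from the shared list until it is exhausted
    with ThreadPoolExecutor(max_workers=4) as executor:
        list(executor.map(crawl_page, urls))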

Random UA & Proxy Pool
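
The original post hides this part behind a login-and-comment wall, so only the heading survives here. As a rough sketch of the idea, the snippet below shows one way to combine a random User-Agent with a proxy drawn from the pool, using the ChromeOptions lines that are commented out in get_info above. The USER_AGENTS list and the make_browser helper are illustrative assumptions, not code from the original.

import random
from selenium import webdriver

# A small pool of example desktop User-Agent strings; any set of real UA strings will do.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
]


def make_browser():
    options = webdriver.ChromeOptions()
    # Random User-Agent for each new browser instance
    options.add_argument('--user-agent=%s' % random.choice(USER_AGENTS))
    # Route traffic through a proxy taken from the proxy pool (get_proxy as defined above)
    proxy = get_proxy().get("proxy")
    if proxy:
        options.add_argument('--proxy-server=http://%s' % proxy)
    # Hide the "Chrome is being controlled by automated test software" info bar
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return webdriver.Chrome(options=options)

get_info could then call make_browser() instead of webdriver.Chrome() so that every page is fetched with a fresh UA/proxy pair.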
