Book finder · fi3ework's Studio

# -*- coding: utf-8 -*-

import re

import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
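
# Assumed setup (not spelled out in the original post): Python 3, a Selenium 3-era
# install (the find_element_by_* helpers below were removed in Selenium 4), and a
# geckodriver binary on PATH so webdriver.Firefox() can launch the browser.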

# Open the site in a simulated Firefox session

url = "https://book.douban.com/"
driver = webdriver.Firefox()
driver.get(url)

# Get the book title from the user

book = input("Title:")

# Find the search box and type the title; "inp-query" is the id of the search box on this page

driver.find_element_by_id("inp-query").send_keys(book)

# Simulate a click on the search button

driver.find_element_by_xpath("//*[@type='submit']").click()

# Open the first matching result

driver.find_element_by_xpath("//*[@class='title-text']").click()
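
# If a newer Selenium (>= 4) is installed, the three lookups above have to use the
# By locators instead; a minimal sketch of the equivalent calls:
#
#   from selenium.webdriver.common.by import By
#   driver.find_element(By.ID, "inp-query").send_keys(book)
#   driver.find_element(By.XPATH, "//*[@type='submit']").click()
#   driver.find_element(By.XPATH, "//*[@class='title-text']").click()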

# Save the URL of the first result (the page the data will be collected from)

book_url = driver.current_url

# Parse the current page with BeautifulSoup

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Read the names of the info fields this book has

book_content_list = []
names = soup.find_all('span', class_="pl")
for i in names:
    book_content_list.append(i.string)

# Clean up the current page source and parse it again

# The intent is to strip the <br/> tags and newlines out of the page source
# before parsing the #info block.
other = driver.page_source.replace('<br/>', '')
others = other.replace('\n', '')
soup = BeautifulSoup(others, 'html.parser')

# Read the title, author, publisher, publication year, page count, price, and so on

book_list = []
other = soup.find('div', id="info")
others = other.find_all('a')
for i in others:
    book_list.append(i.string)
book_list = list(filter(None, book_list))
New_book_list = []
for i in book_list:
    i = i.replace(' ', '')
    i = i.replace('\n', '')
    New_book_list.append(i)

# Print the book's info

length = len(New_book_list)
for i in range(0, length):
    print(book_content_list[i], end='')
    print(New_book_list[i])

# Parse the page with BeautifulSoup once more

soup = BeautifulSoup(driver.page_source, 'html.parser')

# Read the content summary and the author bio

New_contend_list = []
about_content = []
about_anthor = []
content = soup.find_all('div', class_="intro")
for i in content:
    contents = i.find_all('p')
    New_contend_list.append(contents)
# The page may carry two to four "intro" blocks (abridged and full versions of
# the summary and the bio); keep one block for the summary and one for the bio.
if len(New_contend_list) == 3:
    del New_contend_list[1]
elif len(New_contend_list) == 4:
    del New_contend_list[0]
    del New_contend_list[2]
content = New_contend_list[0]
anthor = New_contend_list[1]
for i in content:
    if i.string:  # skip <p> tags whose .string is None (nested markup)
        about_content.append(i.string)
for i in anthor:
    if i.string:
        about_anthor.append(i.string)

# Close the browser

driver.close()

# Print the summaries

print("内容简介:", end='')
for i in about_content:
    print(i, end='')
print()
print("作者简介:", end='')
for i in about_anthor:
    print(i, end='')
print()

# Build the purchase (buy-links) URL

add_url = "buylinks"
buy_url = book_url + add_url
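# The concatenation assumes book_url ends with a trailing slash, which Douban
# subject URLs (https://book.douban.com/subject/<id>/) normally do, giving a
# buy-links page at .../subject/<id>/buylinks.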

# Basic steps of the dynamic scrape: 1) simulate a Firefox browser, 2) load the page

driver2 = webdriver.Firefox()
driver2.get(buy_url)

# Parse buy_url with BeautifulSoup

soup = BeautifulSoup(driver2.page_source, 'html.parser')

# Read each store's quoted price for the book

price_information_list = []
New_price_information_list = []
price_informations = soup.find_all('td', class_="pl2")
for i in price_informations:
    price_information = i.find('a')
    price_information_list.append(price_information)
price_information_list = list(filter(None, price_information_list))
for i in price_information_list:
    New_price_information_list.append(i.string)
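# At this point New_price_information_list is expected to alternate store name and
# price string, i.e. [store, price, store, price, ...]; the comparison further
# below relies on that layout (prices at odd indices).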

# Close the browser

driver2.close()

# Find out which store offers the lowest price.
# The prices arrive as strings, so extract the number and compare numerically
# (a plain string comparison would be lexicographic, e.g. "9.9" > "10.0").

def price_value(s):
    # Pull the first number out of a price string; treat missing values as +inf.
    m = re.search(r'\d+(?:\.\d+)?', s or '')
    return float(m.group()) if m else float('inf')

min_price = New_price_information_list[1]
flag = 1
length = len(New_price_information_list)
for i in range(3, length):
    if i % 2 == 1:  # odd indices hold the prices, even indices the store names
        if price_value(New_price_information_list[i]) < price_value(min_price):
            min_price = New_price_information_list[i]
            flag = i

# Print the cheapest store and its price

print("最便宜的网店名以及价格:%s:%s" % (New_price_information_list[flag - 1], New_price_information_list[flag]))

# Create a workbook and set the encoding

workbook = xlwt.Workbook(encoding='utf-8')

# Create a worksheet

worksheet = workbook.add_sheet('Book finder')

# Write the results to Excel

# Join the content summary and the author bio into single strings

content_text = ''.join(about_content)
anthor_text = ''.join(about_anthor)

# Write the data into the sheet

worksheet.write(0, 0, label='书名:')
worksheet.write(0, 1, label='%s' % book)
length = len(New_book_list)
for i in range(0, length):
    worksheet.write(i + 1, 0, label='%s' % book_content_list[i])
    worksheet.write(i + 1, 1, label='%s' % New_book_list[i])
hang = length + 1
worksheet.write(hang, 0, label='内容简介:')
worksheet.write(hang, 1, label='%s' % content_text)
worksheet.write(hang + 1, 0, label='作者简介:')
worksheet.write(hang + 1, 1, label='%s' % anthor_text)
worksheet.write(hang + 2, 0, label='购买建议:')
worksheet.write(hang + 2, 1, label='%s' % New_price_information_list[flag - 1])
worksheet.write(hang + 2, 2, label='%s' % New_price_information_list[flag])

# Set the column widths

# xlwt column widths are given in 1/256ths of a character width
worksheet.col(0).width = 256 * 9
worksheet.col(1).width = 256 * 22

# Save the workbook

workbook.save('Book finder.xls')