
Automating Baidu Wenku scraping with Python

Table of Contents

Project Introduction

Existing Features

Environment Setup

For Windows Users

For Ubuntu Users

Usage

Main Code

Project Address

Project Introduction

It can download doc, ppt, and pdf files. For doc documents the text can be downloaded, though tables inside a doc cannot; documents stored as images (scans) can also be downloaded. ppt and pdf files are handled by downloading the page images first and placing them into a ppt. In short, anything that can be previewed can be downloaded.

Existing Features

Downloads a previewable Word document as a Word file; scanned documents are supported as well.

Downloads a previewable ppt/pdf as a non-editable ppt. The page only serves images, so in principle an editable version cannot be produced.

Environment Setup

pip install requests
pip install my_fake_useragent
pip install python-docx
pip install opencv-python
pip install python-pptx
pip install selenium
pip install scrapy

This project uses chromedriver to drive the Chrome browser for scraping; the chromedriver version must match your Chrome version.

For Windows Users

1. If your Chrome browser happens to be version 87.0.4280, congratulations: you can skip straight to Usage, because the chromedriver I bundled is exactly that version.

2. Otherwise, download the chromedriver (Windows build) that matches your Chrome version.

3. Replace the existing file with the one you unzipped, then skip to Usage.

For Ubuntu Users

To be honest, since you're already on Ubuntu I'll assume you're a pro: just download the chromedriver (Linux build) matching your Chrome version, change chromedriver_path to the path of the file you downloaded and unzipped, and skip to Usage. Haha, I'm cutting corners here.
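Either way, it's worth a quick sanity check that Chrome and chromedriver really match before running the script. A minimal sketch, assuming selenium 3.x (which still accepts executable_path):

from selenium import webdriver

# Point this at the chromedriver you downloaded/unzipped.
chromedriver_path = "./chromedriver"  # "./chromedriver.exe" on Windows

driver = webdriver.Chrome(executable_path=chromedriver_path)
# The major version numbers should agree, e.g. both 87.x.
print("browser:", driver.capabilities["browserVersion"])
print("driver: ", driver.capabilities["chrome"]["chromedriverVersion"])
driver.quit()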

Usage

Change url in the code to the link of the document you want to download. The script automatically detects the document type, creates a folder in the current directory, and downloads the file into it.
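This excerpt doesn't include the script's entry point. As a rough illustration of how the pieces below could be wired together, here is a hypothetical sketch; DownloadImg and the directory variables come from the main code, while get_html_data and the doc-type xpath are my assumptions, not the article's code:

if __name__ == "__main__":
    downloader = DownloadImg()
    sel = get_html_data(url)  # hypothetical fetch helper, sketched in the main code below
    # hypothetical type check: read the document type off the rendered page
    doc_type = "".join(sel.xpath("//div[@class='doc-tag']/text()").extract()).strip()
    if doc_type in ("ppt", "pdf"):
        downloader.create_ppt(ppt_dir_path, sel)
    else:
        downloader.create_docx(doc_dir_path, sel)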

Main Code

import os
import time

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy import Selector
import requests
from my_fake_useragent import UserAgent
import docx
from docx.shared import Inches
import cv2
from pptx import Presentation
from pptx.util import Inches  # note: this shadows docx.shared.Inches imported above

# chromedriver for Windows
chromedriver_path = "./chromedriver.exe"
# chromedriver for Ubuntu
# chromedriver_path = "./chromedriver"

doc_dir_path = "./doc"
ppt_dir_path = "./ppt"

# url = "https://wenku.baidu.com/view/?fr=search"  # doc_txt p
# url = "https://wenku.baidu.com/view/"  # doc_txt span
# url = "https://wenku.baidu.com/view/?fr=search"  # doc_txt span br
# url = "https://wenku.baidu.com/view/062edabeb6360b4c2e3f5727a5e9856a5712262d?pcf=2&bfetype=new"  # doc_img
# url = "https://wenku.baidu.com/view/2af6de34a7e9856a561252d380eb6294dd88228d"  # VIP-only doc
# url = "https://wenku.baidu.com/view/?fr=search"  # ppt
# url = "https://wenku.baidu.com/view/18a8bc08094e767f5acfa1c7aa00b52acec79c55"  # pdf
# url = "https://wenku.baidu.com/view/bbe27bf21b5f312b3169a45177232f60dccce772"
# url = "https://wenku.baidu.com/view/?fr=search"
# url = "https://wenku.baidu.com/view/"
# url = "https://wenku.baidu.com/view/"
url = "https://wenku.baidu.com/view/"
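# This excerpt never shows the page fetch, but the commented-out call
# "sel = self.get_html_data()" below hints at a helper that renders the
# page and wraps it in a scrapy Selector. A minimal sketch (my assumption,
# written as a standalone function; the headless option and fixed sleep
# are guesses):
def get_html_data(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    try:
        driver.get(url)
        time.sleep(2)  # crude wait so lazy-loaded images get real src attributes
        return Selector(text=driver.page_source)
    finally:
        driver.quit()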

class DownloadImg():

    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        # download a single image
        header = {
            "User-Agent": "{}".format(self.ua.random().strip()),
            'Connection': 'close'}
        r = requests.get(img_url, headers=header, stream=True)
        print("image request status code {}".format(r.status_code))  # report the status code
        if r.status_code == 200:  # write the image to disk
            with open(saved_path, mode="wb") as f:
                f.write(r.content)
            print("download {} success!".format(saved_path))
        del r
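    # For illustration (not from the article), download_one_img can be
    # exercised on its own with placeholder values:
    #   d = DownloadImg()
    #   os.makedirs("./ppt", exist_ok=True)
    #   d.download_one_img("https://example.com/page_0.jpg", "./ppt/0_0.jpg")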

    def create_ppt(self, ppt_dir_path, sel):
        # create the folder if it does not exist
        if not os.path.exists(ppt_dir_path):
            os.makedirs(ppt_dir_path)
        SLD_LAYOUT_TITLE_AND_CONTENT = 6  # layout 6 is the blank slide template
        prs = Presentation()  # instantiate the ppt
        # # fetch the full rendered html
        # sel = self.get_html_data()
        # extract the title
        xpath_title = "//div[@class='doc-title']/text()"
        title = "".join(sel.xpath(xpath_title).extract()).strip()
        # extract the content images
        xpath_content_p = "//div[@class='content singlePage wk-container']/div/p/img"
        xpath_content_p_list = sel.xpath(xpath_content_p)
        xpath_content_p_url_list = []
        for imgs in xpath_content_p_list:
            xpath_content = "./@data-loading-src|./@data-src|./@src"
            contents_list = imgs.xpath(xpath_content).extract()
            xpath_content_p_url_list.append(contents_list)
        img_path_list = []  # record downloaded image paths for inserting into the ppt and deleting afterwards
        # download the images into the target directory
        for index, content_img_p in enumerate(xpath_content_p_url_list):
            p_img_path_list = []
            for index_1, img_one in enumerate(content_img_p):
                one_img_saved_path = os.path.join(ppt_dir_path, "{}_{}.jpg".format(index, index_1))
                self.download_one_img(img_one, one_img_saved_path)
                p_img_path_list.append(one_img_saved_path)
            img_path_list.extend(p_img_path_list)
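    # The excerpt breaks off before the downloaded images are actually
    # placed onto slides. A sketch of how that assembly could look with
    # python-pptx, using cv2 (consistent with the imports above) to keep
    # each image's aspect ratio; a reconstruction, not the article's code:
    def images_to_ppt(self, img_path_list, ppt_path, blank_layout=6):
        prs = Presentation()
        for img_path in img_path_list:
            slide = prs.slides.add_slide(prs.slide_layouts[blank_layout])
            h, w = cv2.imread(img_path).shape[:2]
            pic_width = prs.slide_width          # EMU; scale to the slide width
            pic_height = int(pic_width * h / w)  # preserve the aspect ratio
            slide.shapes.add_picture(img_path, 0, 0, pic_width, pic_height)
        prs.save(ppt_path)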

    def create_docx(self, doc_dir_path, sel):
        # extract the article content
        xpath_content = "//div[contains(@data-id,'div_class_')]//p"
        # xpath_content = "//div[contains(@data-id,'div_class_')]/p"
        contents = sel.xpath(xpath_content)
        # determine the content category
        # (judge_doc, defined elsewhere in the project, picks the text xpath)
        xpath_content_one = self.judge_doc(contents)
        if xpath_content_one.endswith("text()"):  # plain text: scrape it directly
            for content_one in contents:
                one_p_list = content_one.xpath(xpath_content_one).extract()
                p_txt = ""
                for p in one_p_list:
                    if p == " ":
                        p_txt += ('\n' + p)