python简单爬虫开发——打造自己的sitemap生成器

心月IT博客 10-06

默认

摘要：sitemap网站地图是seo网站优化常用的一项功能，虽然网上有很多sitemap自动生成工具，但大多要么有数量限制，要么生成结果不理想，今天就来教教大家用python开发一个属于自己的sitemap生成器。

sitemap网站地图是seo网站优化常用的一项功能，虽然网上有很多sitemap自动生成工具，但大多要么有数量限制，要么生成结果不理想，今天就来教教大家用python开发一个属于自己的sitemap生成器。

sitemap生成器，主要通过爬虫爬取给定网站内的所有url链接，并把不属于目标网站的url链接剔除，最后生成xml或者txt格式的sitemap文件。

那么要实现一个爬虫需要注意什么呢？

首先需要一个调度端（启动/停止爬虫/监视爬虫的运行情况），在爬虫程序中有三个模块：

URL管理器：对将要爬取和已经爬取的url进行管理，并从中取出待爬取的url，将其传送给网页下载器

网页下载器：将url指定的网页下载下来存储成字符串并传送给网页解析器

网页解析器：解析出有价值的数据，解析出指向其他网页的url并补充到url管理器

URL管理器-网页下载器-网页解析器-URL管理器：循环处理，只要url管理器中还有待爬取的url就会一直运行下去，最终把相关联网页中我们需要的数据全部爬取出来。

爬虫运行的流程图解：

爬虫运行流程图解

下面我们就来根据爬虫的运行流程来打造自己专属的sitemap生成器：

1、global_val.py：自定义全局模块

制作sitemap时，有些数据是不固定的，同时同一个变量会在多处使用，这就需要用到自定义全局变量了。

（python3中自定义全局变量并跨模块使用其实挺简单的：我们可以把需要自定义的全局变量写在一个模块里，然后在需要用到这些全局变量的模块中导入该模块，再用"模块名.变量名"的方式给自定义全局变量赋值或者使用）

# Site entered by the user at the prompt. sitemap.py assigns it at start-up, and the
# URL manager, parser and outputer read it to recognise links belonging to that site.
host = None
# Maximum number of URLs to crawl. sitemap.py stores a positive int for a limit
# and -1 when the user did not request one.
num = None

2、爬虫主程序：

sitemap.py

import html_downloader
import html_outputer
import html_parser
import url_manager
import global_val

class SpiderMain(object):
    """Crawl scheduler that drives the URL manager, downloader, parser and outputer."""

    def __init__(self):
        self.urls = url_manager.UrlManager()    # pending / visited URL bookkeeping
        self.downloader = html_downloader.HtmlDownloader()    # fetches page content
        self.parser = html_parser.HtmlParser()    # extracts links and page data
        self.outputer = html_outputer.HtmlOutPut()    # collects page data, writes the sitemap

    def craw(self, root_url):
        """Crawl outward from ``root_url`` and write the sitemap when finished.

        Each pending URL is downloaded and parsed; the links found are queued
        and the page data is handed to the outputer. The loop ends when no
        pending URL is left or when the number of successfully crawled pages
        reaches ``global_val.num`` (a non-positive or unset value means no
        limit). A page that fails is reported and skipped.

        Args:
            root_url: URL the crawl starts from.
        """
        count = 1
        # Seed the URL manager with the start page.
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            new_url = None  # keeps the failure message safe if fetching the URL itself fails
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cnt = self.downloader.download(new_url)
                # parse() returns None for an empty download; the unpacking then
                # raises TypeError and the page is reported as failed below.
                new_urls, new_data = self.parser.parse(new_url, html_cnt)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
            except Exception:
                # Exception (not a bare except) so Ctrl+C / SystemExit still stop the crawl.
                print('craw failed %s' % new_url)
                continue

            # Stop once the requested number of pages has been crawled.
            limit = global_val.num
            if limit is not None and 0 < limit == count:
                break
            count = count + 1

        self.outputer.output_html()

if __name__ == "__main__":
    # Strip stray whitespace so the scheme check and the site filter see the bare value.
    root_url = input("请输入要爬取的域名(如有设置https，请带上 https 协议):").strip()
    # The other modules filter links against the value exactly as the user entered it.
    global_val.host = root_url
    if not root_url.startswith(("http://", "https://")):
        # No scheme given: crawl over plain http starting from the site root.
        root_url = "http://" + root_url + "/"

    max_num = input("请输入sitemap最大url数(不输入则不限制数量):").strip()
    try:
        limit = int(max_num) if max_num else -1
    except ValueError:
        # Non-numeric input used to crash the script; fall back to "no limit".
        print("输入的数量无效，将不限制数量")
        limit = -1
    # -1 is the "no limit" marker checked by SpiderMain.craw.
    global_val.num = limit if limit > 0 else -1

    obj_spider = SpiderMain()
    obj_spider.craw(root_url)

3、url管理器

url_manager.py

import re
import global_val
class UrlManager(object):
    """Keep track of URLs waiting to be crawled and URLs already handed out."""

    # A colon followed by a letter, digit or one of "();".
    # NOTE(review): presumably meant to drop javascript:/mailto:-style links;
    # it also matches an explicit port such as ":8080" -- confirm that is intended.
    _COLON_PATTERN = re.compile(r":[a-zA-Z0-9();]")

    # Substrings that mark a URL as not worth crawling (page anchors, images,
    # archives, office documents, XML files).
    _SKIPPED_MARKERS = (
        "#", ".xml",
        ".jpg", ".png", ".gif", ".bmp",
        ".zip", ".rar",
        ".doc", ".docx", ".xls", ".xlsx", ".pdf",
    )

    def __init__(self):
        self.new_urls = set()  # URLs still to be crawled
        self.old_urls = set()  # URLs already returned by get_new_url()

    def add_new_url(self, url):
        """Queue ``url`` for crawling unless it is filtered out or already known.

        A URL is ignored when it is None, when it contains "http" but not
        ``global_val.host``, when it matches the colon pattern, or when one of
        the skipped markers occurs in it after the first character.

        Args:
            url: Candidate URL string, or None.
        """
        if url is None:
            return
        base_url = global_val.host
        # Absolute links that do not mention the target host belong to other sites.
        if "http" in url and base_url not in url:
            return
        if self._COLON_PATTERN.search(url):
            return
        # find() > 0 (not >= 0): a marker at position 0 does not count as a match.
        if any(url.find(marker) > 0 for marker in self._SKIPPED_MARKERS):
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; None or an empty collection is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return True while at least one URL is still waiting to be crawled."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Remove one pending URL, record it as visited and return it.

        The pending URLs are kept in a set, so the order in which they are
        returned is arbitrary.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

4、网页下载器

html_downloader.py

import urllib.request
from urllib.parse import quote


class HtmlDownloader(object):
    """Download page content with urllib."""

    # Characters left untouched when percent-encoding a URL: the URL delimiters
    # plus "%", so that "&" in query strings survives and sequences that are
    # already percent-encoded are not encoded a second time.
    _SAFE_CHARS = "/:?#[]@!$&'()*+,;=%"

    def download(self, url):
        """Fetch ``url`` and return the raw response body.

        Characters outside the safe set (for example non-ASCII path segments)
        are percent-encoded before the request is made.

        Args:
            url: Address of the page to download, or None.

        Returns:
            The response body as bytes, or None when ``url`` is None or the
            response status is not 200.

        Raises:
            urllib.error.URLError: If the request itself fails.
        """
        if url is None:
            return None
        url = quote(url, safe=self._SAFE_CHARS)
        # The context manager closes the connection even when reading fails.
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                return None
            return response.read()

5、页面解析器

html_parser.py

import re
import urllib.parse
import global_val
from bs4 import BeautifulSoup

class HtmlParser(object):
    """Extract same-site links and basic page data from downloaded HTML."""

    def parse(self, page_url, html_cnt):
        """Parse one downloaded page.

        Args:
            page_url: URL the content was downloaded from.
            html_cnt: Downloaded page content.

        Returns:
            A ``(new_urls, new_data)`` tuple: the set of links found on the
            page and a dict with the page's ``'title'`` and ``'url'``.
            Returns None when either argument is None.
        """
        if page_url is None or html_cnt is None:
            return None
        soup = BeautifulSoup(html_cnt, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect the absolute URLs of all links that mention the target host."""
        new_urls = set()
        # Every <a> element that carries an href attribute.
        links = soup.find_all('a', href=re.compile(r".*"))
        base_url = global_val.host
        for link in links:
            # Resolve relative links against the page they were found on.
            new_url_full = urllib.parse.urljoin(page_url, link['href'])
            if base_url not in new_url_full:
                continue
            new_urls.add(new_url_full)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return the page's title and URL as a dict.

        A page without a <title> element gets an empty title instead of
        raising, so its links and its sitemap entry are not lost.
        """
        title_node = soup.find('title')
        return {
            'title': title_node.get_text() if title_node is not None else '',
            'url': page_url,
        }

6、内容输出器

html_outputer.py

import re
import time
import global_val

class HtmlOutPut(object):
    """Collect per-page data and write it out as an XML sitemap."""

    def __init__(self):
        self.datas = []  # one dict per crawled page; output_html() reads its 'url' key

    def collect_data(self, data):
        """Remember ``data`` for the final output; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write all collected URLs to ``<host>sitemap.xml`` in the working directory.

        Every entry gets today's date as ``lastmod``, a ``daily`` change
        frequency and a priority derived from how deep the URL sits below the
        site root (1.0 for the root itself, then 0.8, 0.6 and 0.4).
        """
        # Local import: the only place in this module that needs XML escaping.
        from xml.sax.saxutils import escape

        host = global_val.host
        if host.startswith(("http://", "https://")):
            root_url = host
        else:
            root_url = "http://" + host

        # Build the file name from the bare host: a scheme prefix such as
        # "https://" would otherwise be taken for a directory path and make
        # open() fail. Characters that are invalid in file names become "_".
        bare_host = re.sub(r"^https?://", "", host).strip("/")
        filename = re.sub(r'[\\/:*?"<>|]', "_", bare_host) + 'sitemap.xml'

        # Escape the root so "." and other regex metacharacters match literally.
        root = re.escape(root_url)
        priority_rules = (
            (re.compile(root + "/?$"), '1.0'),
            (re.compile(root + "/?[^/]*/$"), '0.8'),
            (re.compile(root + "/?[^/]*/[^/]*$"), '0.6'),
        )
        lastmod = time.strftime('%Y-%m-%d', time.localtime())

        # Always pass an explicit encoding so the output does not depend on the platform default.
        with open(filename, 'w', encoding="utf-8") as fw:
            fw.write('<?xml version="1.0" encoding="UTF-8"?>')
            fw.write('\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

            for data in self.datas:
                url = data['url']
                priority = '0.4'
                for pattern, value in priority_rules:
                    if pattern.search(url):
                        priority = value
                        break

                fw.write('\n\t<url>')
                # "&", "<" and ">" must be entity-escaped inside XML text.
                fw.write('\n\t\t<loc>' + escape(url) + '</loc>')
                fw.write("\n\t\t<lastmod>" + lastmod + "</lastmod>")
                fw.write('\n\t\t<changefreq>daily</changefreq>')
                fw.write('\n\t\t<priority>' + priority + '</priority>')
                fw.write('\n\t</url>')

            fw.write('\n</urlset>')

到这里自己专属的sitemap生成器就算打造完成了，当然了，这里打造的sitemap生成器还是有很多地方可以继续完善的，比如 changefreq，priority，还有生成的sitemap文件类型，这里最主要的还是打造爬虫的这些思想，只要能把这些掌握吃透，继续完善还是轻而易举的，下面让我们来看看运行效果吧：

爬虫生成sitemap运行效果