蒋勋的红楼梦

发表于 2019-06-16 更新于 2026-01-02 分类于 scrapy

最近在听蒋勋的《细说红楼梦》，那温柔的台普，那细腻的解读，那新颖的角度，那穿插在解说之中、娓娓道来的个人经历。
但是我搜了几个在线的FM，都没有资源，不得已在一个微信公众号上听，体验不是太好
于是我想在网上找资源下载下来
翻来翻去，找到了这个看着很古老的网站。我打开 Chrome 的开发者工具，发现音频资源直接大大方方地暴露在外面，没有任何反爬措施，奈斯

于是就有了下面这个爬虫，其实代码主要是从这里抄的

# -*- coding: utf-8 -*-
import os
from contextlib import closing
import threading
import requests

# HTTP headers sent with every download request. The User-Agent is a
# desktop Chrome string; it is split over several literals purely for
# readability and concatenates to a single value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/57.0.2987.133 Safari/537.36'
    ),
}

# Directory the downloaded files are written to.
out_dir = './out'
# Number of worker threads started at the bottom of the script.
thread_num = 4

# Make sure the output directory is there before any worker writes to it.
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

def download(img_url, img_name):
    """Download ``img_url`` and store it as ``img_name`` inside ``out_dir``.

    Nothing is fetched when the target file already exists. A response with
    a non-2xx status code, or with a missing/zero ``content-length`` header,
    is reported on stdout and skipped.

    The body is streamed into ``<target>.part`` and renamed to its final
    name only after the whole body has been written. An interrupted transfer
    therefore never leaves a truncated file under the final name, which the
    "already exists" check above would otherwise skip on every later run.

    :param img_url: URL of the resource to fetch.
    :param img_name: file name (relative to ``out_dir``) to save it under.
    :raises Exception: whatever ``requests`` or the file system raises while
        connecting, streaming, or writing; the partial file is removed first.
    """
    target_path = os.path.join(out_dir, img_name)
    if os.path.isfile(target_path):
        return
    partial_path = target_path + '.part'
    # The timeout keeps a stalled server from blocking a worker forever.
    with closing(requests.get(img_url, stream=True, headers=headers,
                              timeout=30)) as r:
        rc = r.status_code
        if not 200 <= rc < 300:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('returnCode%s\t%s' % (rc, img_url))
            return
        content_length = int(r.headers.get('content-length', '0'))
        if content_length == 0:
            print('size0\t%s' % img_url)
            return
        try:
            with open(partial_path, 'wb') as f:
                for data in r.iter_content(1024):
                    f.write(data)
        except BaseException:
            # Drop the incomplete file, then let the caller see the error.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        # The body was written completely: publish it under the final name.
        os.rename(partial_path, target_path)

def get_imgurl_generate(first=1, last=160,
                        base_url="http://mp3.aikeu.com/15626"):
    """Yield ``(url, file_name)`` pairs for the numbered mp3 files.

    With the default arguments this produces the same 160 pairs as before,
    for file numbers 1 through 160. Passing a narrower range allows fetching
    only a subset, e.g. to retry a few files.

    :param first: number of the first file to yield (inclusive).
    :param last: number of the last file to yield (inclusive); nothing is
        yielded when ``last < first``.
    :param base_url: URL prefix that ``/<number>.mp3`` is appended to.
    """
    for i in range(first, last + 1):
        file_name = "{}.mp3".format(i)
        yield ("{}/{}".format(base_url, file_name), file_name)

# Guards the job generator shared by all workers: a generator must not be
# advanced by two threads at the same time.
lock = threading.Lock()

def loop(imgs):
    """Worker body: pull ``(url, file_name)`` jobs from ``imgs`` until empty.

    Each job is handed to ``download``. A failing download is reported on
    stdout together with the error and the worker moves on to the next job,
    so one bad file does not stop the thread.

    :param imgs: iterator of ``(url, file_name)`` pairs shared between the
        worker threads; access to it is serialized with ``lock``.
    """
    thread_name = threading.current_thread().name
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('thread %s is running...' % thread_name)

    while True:
        try:
            # Hold the lock only while taking a job, not while downloading.
            with lock:
                img_url, img_name = next(imgs)
        except StopIteration:
            break
        try:
            download(img_url, img_name)
        except Exception as e:
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; the error is logged for diagnosis.
            print('exceptfail\t%s\t%r' % (img_url, e))
    print('thread %s is end...' % thread_name)

# One job generator shared by all workers; each thread takes its next job
# from it inside loop().
img_gen = get_imgurl_generate()

# Create and start the workers one after another. They are regular
# (non-daemon) threads, so the interpreter keeps running until they finish.
for worker_no in range(thread_num):
    worker = threading.Thread(
        target=loop,
        args=(img_gen,),
        name='LoopThread%s' % worker_no,
    )
    worker.start()