📦 himself65 / LuoguCrawler

📄 example_codedownload.py
#!/usr/bin/env python
from luogu import *
from bs4 import BeautifulSoup

import os
import re
import ssl

cookie = 'UM_distinctid=1628f18d8fa568-0c77f61d2d6685-336c7b05-100200-1628f18d8fb74e; __client_id=4481c1bb80e250e3e1b43eb658c1f4882b4c98a5; CNZZDATA5476811=cnzz_eid%3D657620256-1522818985-%26ntime%3D1528115158'  # replace with your own logged-in cookie
ID = 72813  # put your Luogu user id here
mainUrl = 'https://www.luogu.org'
pageUrl = 'https://www.luogu.org/recordnew/lists?uid=' + str(ID) + '&page='
downloadPath = 'download/'
codePath = downloadPath + 'code/'

DEBUG = True

# shared browser session; the cookie header authenticates every request
browser = LuoguBrowser()
browser.insert_headers('cookie', cookie)


def getPageUrl(pageNum):
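    """Build the record-list URL for the given page number."""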
    return pageUrl + str(pageNum)


def saveLocal(fileName, text):
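    """Save text as download/code/<fileName>.cpp; do nothing if it already exists."""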
    path = codePath + fileName + '.cpp'
    if os.path.exists(path):
        return
    with open(path, mode='w', encoding='utf-8') as f:
        f.write(text)


def downloadCode(url):
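    """Fetch one record page and save its <code> block; return True on success."""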
    browser.openURL(url)
    data = browser.getData()
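    # responses arrive gzip-compressed; decompress before parsing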
    html = browser.ungzip(data).decode()
    soup = BeautifulSoup(html, 'html.parser')
    try:
        text = soup.find('code').get_text()
        name = soup.find('h1').get_text()
        saveLocal(name, text)
        print('Downloaded: %s' % url)
        return True
    except AttributeError:
        print('Download failed: %s' % url)
        return False


def searchPage(start, end):
    """ [start, end)
    """
    count = 0
    for i in range(start, end):
        if DEBUG:
            print("现在是第%d页" % i)
        url = getPageUrl(i)
        browser.openURL(url)
        data = browser.getData()
        html = browser.ungzip(data).decode()
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.find('div', {
            'class': 'lg-content-table-left'
        }).find_all('div', {'class': 'am-g lg-table-bg0 lg-table-row'})
        for item in items:
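            # accepted records carry a green score element; rows without one are skipped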
            point = item.find('strong', {'class': 'lg-fg-green'})
            if point is None:
                continue
            acurl = item.find_all('a', {
                'target': '_blank',
                'data-pjax': ''
            })[0]['href']
            # re.search takes (pattern, string); only record pages
            # (/record/show?rid=NNN) contain the source code, so skip other links
            if not re.search(r'/record/show\?rid=\d+', acurl):
                if DEBUG:
                    print(acurl)
                continue
            if downloadCode(mainUrl + acurl):
                count += 1
    print('Total codes downloaded:', count)


def main():
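    """Read the pagination bar on the first page to get the page count, then crawl all pages."""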
    page = 1  # start Page
    url = getPageUrl(page)
    browser.openURL(url)
    data = browser.getData()
    html = browser.ungzip(data).decode()
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find('ul', {
        'class': 'am-pagination am-pagination-centered'
    }).find_all('li')
    lastItem = items[-1]
    maxPage = lastItem.find('a')['data-ci-pagination-page']
    # the last pagination entry holds the maximum page number
    print('Max page:', maxPage)
    searchPage(1, int(maxPage) + 1)


def init():
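    """Create the download/ and download/code/ directories if they do not exist."""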
    print('Initializing...')
    if not os.path.exists(downloadPath):
        print('Creating folder download...')
        os.makedirs(downloadPath)
        print('done...')
    if not os.path.exists(codePath):
        print('Creating folder code...')
        os.makedirs(codePath)
        print('done...')
    print('Initialization complete')


if __name__ == '__main__':
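    # globally disable TLS certificate verification (quick workaround; weakens security)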
    ssl._create_default_https_context = ssl._create_unverified_context
    init()
    main()