1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151#!/usr/bin/python3
#-*- coding:utf-8 -*-
"""
简单的爬虫项目来访问洛谷官网
"""
from __future__ import absolute_import
from http import cookiejar
from urllib import parse, request
from bs4 import BeautifulStoneSoup
import json
import socket
import gzip
__author__ = "Himself65"
__license__ = "MIT"
defaultURL = "https://www.luogu.org"
from random import sample
user_agents = [
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
]
def get_agent():
"""
每次随机返回一个user-agent
"""
return sample(user_agents, 1)[0]
class LuoguBrowser(object):
"""
"""
_headers = {}
def __init__(self):
"""
初始化访问洛谷
"""
self._headers['user-agent'] = get_agent()
self._headers['accept-encoding'] = "gzip, deflate, br"
self._headers['accept-language'] = "zh,en;q=0.9,zh-CN;q=0.8,ja;q=0.7"
self.setOpener()
def insert_headers(self, key, value):
"""
插入值到请求头
每次插入后会自动setOpener
"""
self._headers[key] = value
self.setOpener()
def setOpener(self):
""" 初始化opener
"""
cj = cookiejar.CookieJar()
pro = request.HTTPCookieProcessor(cj)
self.opener = request.build_opener(pro)
header = []
for key, value in self._headers.items():
elem = (key, value)
header.append(elem)
self.opener.addheaders = header
def openURL(self, url, data=None, timeout=None):
"""
访问地址
"""
import socket
if timeout is None:
timeout = socket._GLOBAL_DEFAULT_TIMEOUT
if url is None:
raise AttributeError('url is none')
# url = url.encode('UTF8', errors='strict')
self.response = self.opener.open(url, data=data, timeout=timeout)
def getData(self):
"""
获取response内容
"""
return self.response.read()
def getResponse(self):
"""
获取response
"""
return self.response
@staticmethod
def create_query_string_message(dictionary):
"""
创建请求地址
Args:
dictionary -> 字典,例如:
Returns -> String
例如:
{
"id": 761282619,
"name": "himself65"
}
Return -> "id=761282619&name=himself65"
"""
s = ""
for (key, value) in dictionary.items():
s = s + str(key) + "=" + str(value) + "&"
return s.rstrip("&")
@staticmethod
def getDataFromResponse(response, data='more'):
"""
获取response请求中特定内容
Args:
response -> Response
data -> String, 需要获得的数据,默认为'more'
Returns -> String
"""
messages = json.loads(response)
return messages[data]
@staticmethod
def check_Accessible(data, name='code', accessStatus=200):
"""
检查状态值是否成功
Args:
data -> Dictionary, 为Response返回的请求
Returns -> Bool
"""
data_json = json.loads(data)
return data_json[name] == accessStatus if True else False
@staticmethod
def ungzip(data):
"""
ungzip the data
"""
try:
ungzipData = gzip.decompress(data)
except:
print("解压失败,返回原数据")
return data
return ungzipData