📦 yjhjstz / deep-into-node

📄 chapter10-2.md · 208 lines
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208## HTTP 2/2

http 模块提供了两个函数 http.request 和 http.get,功能是作为客户端向 HTTP服务器发起请求。

这通常来实现自己的爬虫程序, 笔者自己写的一个爬取知乎的一个例子:https://github.com/yjhjstz/iZhihu


### GET 例子
```js
const http = require("http")
http.get('http://www.baidu.com', (res) => {
  console.log(`Got response: ${res.statusCode}`);
  // consume response body
  res.resume();
}).on('error', (e) => {
  console.log(`Got error: ${e.message}`);
});
```
上面的程序会返回一个200的状态码!

### HTTP Client
Node.js 中,http.get 通过创建一个 `ClientRequest`的对象,建立与服务端的连接通信。
```js
// lib/_http_client.js
function ClientRequest(options, cb) {
  var self = this;
  OutgoingMessage.call(self);

  // ...
  const defaultPort = options.defaultPort ||
                      self.agent && self.agent.defaultPort;

  var port = options.port = options.port || defaultPort || 80;
  var host = options.host = options.hostname || options.host || 'localhost';

  if (options.setHost === undefined) {
    var setHost = true;
  }

  self.socketPath = options.socketPath;

  var method = self.method = (options.method || 'GET').toUpperCase();
  if (!common._checkIsHttpToken(method)) {
    throw new TypeError('Method must be a valid HTTP token');
  }
  self.path = options.path || '/';
  if (cb) {
    self.once('response', cb);
  }

  // ...

  var called = false;
  if (self.socketPath) {
    // ...
  } else if (self.agent) {
    // ...
  } else {
    // No agent, default to Connection:close.
    self._last = true;
    self.shouldKeepAlive = false;
    if (typeof options.createConnection === 'function') {
      const newSocket = options.createConnection(options, oncreate);
      if (newSocket && !called) {
        called = true;
        self.onSocket(newSocket);
      } else {
        return;
      }
    } else {
      debug('CLIENT use net.createConnection', options);
      self.onSocket(net.createConnection(options));
    }
  }

  function oncreate(err, socket) {
    // ...
  }

  self._deferToConnect(null, null, function() {
    self._flush();
    self = null;
  });
}

util.inherits(ClientRequest, OutgoingMessage);
```
callback 通过 `self.once('response', cb);`, 监听了 response 事件。之后如果没有设置代理服务,则默认使用
net 模块创建与服务器的连接。那么 response 事件是哪里发送的呢?

下面我们看到比较重要的 `onSocket` 函数。
```js
ClientRequest.prototype.onSocket = function(socket) {
  process.nextTick(onSocketNT, this, socket);
};

function onSocketNT(req, socket) {
  if (req.aborted) {
    // If we were aborted while waiting for a socket, skip the whole thing.
    socket.emit('free');
  } else {
    tickOnSocket(req, socket);
  }
}
```

这边 onSocket 必须是一个异步函数,大家可以仔细体会下! 同时 onSocketNT 会异常做了处理,当请求失败时,则发送 free 事件。
否则来到 `tickOnSocket`。

```js
function tickOnSocket(req, socket) {
  var parser = parsers.alloc();
  req.socket = socket;
  req.connection = socket;
  parser.reinitialize(HTTPParser.RESPONSE);
  parser.socket = socket;
  parser.incoming = null;
  parser.outgoing = req;
  req.parser = parser;

  socket.parser = parser;
  socket._httpMessage = req;

  // Setup "drain" propagation.
  httpSocketSetup(socket);

  // Propagate headers limit from request object to parser
  if (typeof req.maxHeadersCount === 'number') {
    parser.maxHeaderPairs = req.maxHeadersCount << 1;
  } else {
    // Set default value because parser may be reused from FreeList
    parser.maxHeaderPairs = 2000;
  }

  parser.onIncoming = parserOnIncomingClient;
  socket.removeListener('error', freeSocketErrorListener);
  socket.on('error', socketErrorListener);
  socket.on('data', socketOnData);
  socket.on('end', socketOnEnd);
  socket.on('close', socketCloseListener);
  req.emit('socket', socket);
}
```

同 HTTP Server 类似,从池中申请一个解析器,用于解析 HTTP 协议, 到这一步说明连接已经建立,所以重新设置 error 事件的回调。

同时设置 数据回调等,然后发送 `socket`事件, 来到 `parserOnIncomingClient`.

```js
// client
function parserOnIncomingClient(res, shouldKeepAlive) {
  var socket = this.socket;
  var req = socket._httpMessage;


  // propagate "domain" setting...
  if (req.domain && !res.domain) {
    debug('setting "res.domain"');
    res.domain = req.domain;
  }

  debug('AGENT incoming response!');

  if (req.res) {
    // We already have a response object, this means the server
    // sent a double response.
    socket.destroy();
    return;
  }
  req.res = res;

  var isHeadResponse = req.method === 'HEAD';
 
  // ...
  req.res = res;
  res.req = req;

  // add our listener first, so that we guarantee socket cleanup
  res.on('end', responseOnEnd);
  var handled = req.emit('response', res);

  // If the user did not listen for the 'response' event, then they
  // can't possibly read the data, so we ._dump() it into the void
  // so that the socket doesn't hang there in a paused state.
  if (!handled)
    res._dump();

  return isHeadResponse;
}
```

在这里发送 response 事件,参数对象 `res` 上也挂上了 `req` 对象。这样 req 和 res 就相互引用。

用户的 callback 终于得到回调。

### 总结
上面只是梳理了一个http client 主线, 实际我们很少使用该模块,而是使用第三方的 npm 包,比如
* urllib (轻量级)
* request


### 参考
*https://nodejs.org/api/http.html
*https://github.com/nodejs/node/blob/master/lib/_http_client.js
*https://github.com/nodejs/http-parser