From 0af6e2d0c8fbb606db58b213ff4d0d28a1e38f4e Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 22 Oct 2021 09:58:41 +0000 Subject: ext: switch filter_body to picohttpparser This should open the door for us to process chunked data faster by modifying the buffer in-place rather than copying to a destination buffer. --- ext/unicorn_http/picohttpparser.c.h | 670 ++++++++++++++++++++++++++++++++ ext/unicorn_http/picohttpparser.h | 92 +++++ ext/unicorn_http/unicorn_http.rl | 138 +++---- ext/unicorn_http/unicorn_http_common.rl | 10 - test/unit/test_http_parser.rb | 2 +- test/unit/test_http_parser_ng.rb | 13 +- 6 files changed, 828 insertions(+), 97 deletions(-) create mode 100644 ext/unicorn_http/picohttpparser.c.h create mode 100644 ext/unicorn_http/picohttpparser.h diff --git a/ext/unicorn_http/picohttpparser.c.h b/ext/unicorn_http/picohttpparser.c.h new file mode 100644 index 0000000..f4e295f --- /dev/null +++ b/ext/unicorn_http/picohttpparser.c.h @@ -0,0 +1,670 @@ +/* + * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase, + * Shigeo Mitsunari + * + * The software is licensed under either the MIT License (below) or the Perl + * license. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#ifdef __SSE4_2__ +#ifdef _MSC_VER +#include +#else +#include +#endif +#endif +#include "picohttpparser.h" + +#if __GNUC__ >= 3 +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +#ifdef _MSC_VER +#define ALIGNED(n) _declspec(align(n)) +#else +#define ALIGNED(n) __attribute__((aligned(n))) +#endif + +#define IS_PRINTABLE_ASCII(c) ((unsigned char)(c)-040u < 0137u) + +#define CHECK_EOF() \ + if (buf == buf_end) { \ + *ret = -2; \ + return NULL; \ + } + +#define EXPECT_CHAR_NO_CHECK(ch) \ + if (*buf++ != ch) { \ + *ret = -1; \ + return NULL; \ + } + +#define EXPECT_CHAR(ch) \ + CHECK_EOF(); \ + EXPECT_CHAR_NO_CHECK(ch); + +#define ADVANCE_TOKEN(tok, toklen) \ + do { \ + const char *tok_start = buf; \ + static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \ + int found2; \ + buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \ + if (!found2) { \ + CHECK_EOF(); \ + } \ + while (1) { \ + if (*buf == ' ') { \ + break; \ + } else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { \ + if ((unsigned char)*buf < '\040' || *buf == '\177') { \ + *ret = -1; \ + return NULL; \ + } \ + } \ + ++buf; \ + CHECK_EOF(); \ + } \ + tok = tok_start; \ + toklen = buf - tok_start; \ + } while (0) + +static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\1\0\1\1\1\1\1\0\0\1\1\0\1\1\0\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0" + "\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\1\1" + "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\1\0\1\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + +static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found) +{ + *found = 0; +#if __SSE4_2__ + if (likely(buf_end - buf >= 16)) { + __m128i ranges16 = _mm_loadu_si128((const __m128i *)ranges); + + size_t left = (buf_end - buf) & ~15; + do { + __m128i b16 = _mm_loadu_si128((const __m128i *)buf); + int r = _mm_cmpestri(ranges16, ranges_size, b16, 16, _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS); + if (unlikely(r != 16)) { + buf += r; + *found = 1; + break; + } + buf += 16; + left -= 16; + } while (likely(left != 0)); + } +#else + /* suppress unused parameter warning */ + (void)buf_end; + (void)ranges; + (void)ranges_size; +#endif + return buf; +} + +static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret) +{ + const char *token_start = buf; + +#ifdef __SSE4_2__ + static const char ALIGNED(16) ranges1[16] = "\0\010" /* allow HT */ + "\012\037" /* allow SP and up to but not including DEL */ + "\177\177"; /* allow chars w. 
MSB set */ + int found; + buf = findchar_fast(buf, buf_end, ranges1, 6, &found); + if (found) + goto FOUND_CTL; +#else + /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */ + while (likely(buf_end - buf >= 8)) { +#define DOIT() \ + do { \ + if (unlikely(!IS_PRINTABLE_ASCII(*buf))) \ + goto NonPrintable; \ + ++buf; \ + } while (0) + DOIT(); + DOIT(); + DOIT(); + DOIT(); + DOIT(); + DOIT(); + DOIT(); + DOIT(); +#undef DOIT + continue; + NonPrintable: + if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) { + goto FOUND_CTL; + } + ++buf; + } +#endif + for (;; ++buf) { + CHECK_EOF(); + if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { + if ((likely((unsigned char)*buf < '\040') && likely(*buf != '\011')) || unlikely(*buf == '\177')) { + goto FOUND_CTL; + } + } + } +FOUND_CTL: + if (likely(*buf == '\015')) { + ++buf; + EXPECT_CHAR('\012'); + *token_len = buf - 2 - token_start; + } else if (*buf == '\012') { + *token_len = buf - token_start; + ++buf; + } else { + *ret = -1; + return NULL; + } + *token = token_start; + + return buf; +} + +static const char *is_complete(const char *buf, const char *buf_end, size_t last_len, int *ret) +{ + int ret_cnt = 0; + buf = last_len < 3 ? buf : buf + last_len - 3; + + while (1) { + CHECK_EOF(); + if (*buf == '\015') { + ++buf; + CHECK_EOF(); + EXPECT_CHAR('\012'); + ++ret_cnt; + } else if (*buf == '\012') { + ++buf; + ++ret_cnt; + } else { + ++buf; + ret_cnt = 0; + } + if (ret_cnt == 2) { + return buf; + } + } + + *ret = -2; + return NULL; +} + +#define PARSE_INT(valp_, mul_) \ + if (*buf < '0' || '9' < *buf) { \ + buf++; \ + *ret = -1; \ + return NULL; \ + } \ + *(valp_) = (mul_) * (*buf++ - '0'); + +#define PARSE_INT_3(valp_) \ + do { \ + int res_ = 0; \ + PARSE_INT(&res_, 100) \ + *valp_ = res_; \ + PARSE_INT(&res_, 10) \ + *valp_ += res_; \ + PARSE_INT(&res_, 1) \ + *valp_ += res_; \ + } while (0) + +/* returned pointer is always within [buf, buf_end), or null */ +static const char *parse_token(const char *buf, const char *buf_end, const char **token, size_t *token_len, char next_char, + int *ret) +{ + /* We use pcmpestri to detect non-token characters. This instruction can take no more than eight character ranges (8*2*8=128 + * bits that is the size of a SSE register). Due to this restriction, characters `|` and `~` are handled in the slow loop. */ + static const char ALIGNED(16) ranges[] = "\x00 " /* control chars and up to SP */ + "\"\"" /* 0x22 */ + "()" /* 0x28,0x29 */ + ",," /* 0x2c */ + "//" /* 0x2f */ + ":@" /* 0x3a-0x40 */ + "[]" /* 0x5b-0x5d */ + "{\xff"; /* 0x7b-0xff */ + const char *buf_start = buf; + int found; + buf = findchar_fast(buf, buf_end, ranges, sizeof(ranges) - 1, &found); + if (!found) { + CHECK_EOF(); + } + while (1) { + if (*buf == next_char) { + break; + } else if (!token_char_map[(unsigned char)*buf]) { + *ret = -1; + return NULL; + } + ++buf; + CHECK_EOF(); + } + *token = buf_start; + *token_len = buf - buf_start; + return buf; +} + +/* returned pointer is always within [buf, buf_end), or null */ +static const char *parse_http_version(const char *buf, const char *buf_end, int *minor_version, int *ret) +{ + /* we want at least [HTTP/1.] 
to try to parse */ + if (buf_end - buf < 9) { + *ret = -2; + return NULL; + } + EXPECT_CHAR_NO_CHECK('H'); + EXPECT_CHAR_NO_CHECK('T'); + EXPECT_CHAR_NO_CHECK('T'); + EXPECT_CHAR_NO_CHECK('P'); + EXPECT_CHAR_NO_CHECK('/'); + EXPECT_CHAR_NO_CHECK('1'); + EXPECT_CHAR_NO_CHECK('.'); + PARSE_INT(minor_version, 1); + return buf; +} + +static const char *parse_headers(const char *buf, const char *buf_end, struct phr_header *headers, size_t *num_headers, + size_t max_headers, int *ret) +{ + for (;; ++*num_headers) { + CHECK_EOF(); + if (*buf == '\015') { + ++buf; + EXPECT_CHAR('\012'); + break; + } else if (*buf == '\012') { + ++buf; + break; + } + if (*num_headers == max_headers) { + *ret = -1; + return NULL; + } + if (!(*num_headers != 0 && (*buf == ' ' || *buf == '\t'))) { + /* parsing name, but do not discard SP before colon, see + * http://www.mozilla.org/security/announce/2006/mfsa2006-33.html */ + if ((buf = parse_token(buf, buf_end, &headers[*num_headers].name, &headers[*num_headers].name_len, ':', ret)) == NULL) { + return NULL; + } + if (headers[*num_headers].name_len == 0) { + *ret = -1; + return NULL; + } + ++buf; + for (;; ++buf) { + CHECK_EOF(); + if (!(*buf == ' ' || *buf == '\t')) { + break; + } + } + } else { + headers[*num_headers].name = NULL; + headers[*num_headers].name_len = 0; + } + const char *value; + size_t value_len; + if ((buf = get_token_to_eol(buf, buf_end, &value, &value_len, ret)) == NULL) { + return NULL; + } + /* remove trailing SPs and HTABs */ + const char *value_end = value + value_len; + for (; value_end != value; --value_end) { + const char c = *(value_end - 1); + if (!(c == ' ' || c == '\t')) { + break; + } + } + headers[*num_headers].value = value; + headers[*num_headers].value_len = value_end - value; + } + return buf; +} + +static const char *parse_request(const char *buf, const char *buf_end, const char **method, size_t *method_len, const char **path, + size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers, + size_t max_headers, int *ret) +{ + /* skip first empty line (some clients add CRLF after POST content) */ + CHECK_EOF(); + if (*buf == '\015') { + ++buf; + EXPECT_CHAR('\012'); + } else if (*buf == '\012') { + ++buf; + } + + /* parse request line */ + if ((buf = parse_token(buf, buf_end, method, method_len, ' ', ret)) == NULL) { + return NULL; + } + do { + ++buf; + CHECK_EOF(); + } while (*buf == ' '); + ADVANCE_TOKEN(*path, *path_len); + do { + ++buf; + CHECK_EOF(); + } while (*buf == ' '); + if (*method_len == 0 || *path_len == 0) { + *ret = -1; + return NULL; + } + if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) { + return NULL; + } + if (*buf == '\015') { + ++buf; + EXPECT_CHAR('\012'); + } else if (*buf == '\012') { + ++buf; + } else { + *ret = -1; + return NULL; + } + + return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret); +} + +static +int phr_parse_request(const char *buf_start, size_t len, const char **method, size_t *method_len, const char **path, + size_t *path_len, int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len) +{ + const char *buf = buf_start, *buf_end = buf_start + len; + size_t max_headers = *num_headers; + int r = -2; + + *method = NULL; + *method_len = 0; + *path = NULL; + *path_len = 0; + *minor_version = -1; + *num_headers = 0; + + /* if last_len != 0, check if the request is complete (a fast countermeasure + againt slowloris */ + if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { + return r; 
+ } + + if ((buf = parse_request(buf, buf_end, method, method_len, path, path_len, minor_version, headers, num_headers, max_headers, + &r)) == NULL) { + return r; + } + + return (int)(buf - buf_start); +} + +static const char *parse_response(const char *buf, const char *buf_end, int *minor_version, int *status, const char **msg, + size_t *msg_len, struct phr_header *headers, size_t *num_headers, size_t max_headers, int *ret) +{ + /* parse "HTTP/1.x" */ + if ((buf = parse_http_version(buf, buf_end, minor_version, ret)) == NULL) { + return NULL; + } + /* skip space */ + if (*buf != ' ') { + *ret = -1; + return NULL; + } + do { + ++buf; + CHECK_EOF(); + } while (*buf == ' '); + /* parse status code, we want at least [:digit:][:digit:][:digit:] to try to parse */ + if (buf_end - buf < 4) { + *ret = -2; + return NULL; + } + PARSE_INT_3(status); + + /* get message including preceding space */ + if ((buf = get_token_to_eol(buf, buf_end, msg, msg_len, ret)) == NULL) { + return NULL; + } + if (*msg_len == 0) { + /* ok */ + } else if (**msg == ' ') { + /* Remove preceding space. Successful return from `get_token_to_eol` guarantees that we would hit something other than SP + * before running past the end of the given buffer. */ + do { + ++*msg; + --*msg_len; + } while (**msg == ' '); + } else { + /* garbage found after status code */ + *ret = -1; + return NULL; + } + + return parse_headers(buf, buf_end, headers, num_headers, max_headers, ret); +} + +static +int phr_parse_response(const char *buf_start, size_t len, int *minor_version, int *status, const char **msg, size_t *msg_len, + struct phr_header *headers, size_t *num_headers, size_t last_len) +{ + const char *buf = buf_start, *buf_end = buf + len; + size_t max_headers = *num_headers; + int r; + + *minor_version = -1; + *status = 0; + *msg = NULL; + *msg_len = 0; + *num_headers = 0; + + /* if last_len != 0, check if the response is complete (a fast countermeasure + against slowloris */ + if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { + return r; + } + + if ((buf = parse_response(buf, buf_end, minor_version, status, msg, msg_len, headers, num_headers, max_headers, &r)) == NULL) { + return r; + } + + return (int)(buf - buf_start); +} + +static +int phr_parse_headers(const char *buf_start, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len) +{ + const char *buf = buf_start, *buf_end = buf + len; + size_t max_headers = *num_headers; + int r; + + *num_headers = 0; + + /* if last_len != 0, check if the response is complete (a fast countermeasure + against slowloris */ + if (last_len != 0 && is_complete(buf, buf_end, last_len, &r) == NULL) { + return r; + } + + if ((buf = parse_headers(buf, buf_end, headers, num_headers, max_headers, &r)) == NULL) { + return r; + } + + return (int)(buf - buf_start); +} + +enum { + CHUNKED_IN_CHUNK_SIZE, + CHUNKED_IN_CHUNK_EXT, + CHUNKED_IN_CHUNK_DATA, + CHUNKED_IN_CHUNK_CRLF, + CHUNKED_IN_TRAILERS_LINE_HEAD, + CHUNKED_IN_TRAILERS_LINE_MIDDLE +}; + +static int decode_hex(int ch) +{ + if ('0' <= ch && ch <= '9') { + return ch - '0'; + } else if ('A' <= ch && ch <= 'F') { + return ch - 'A' + 0xa; + } else if ('a' <= ch && ch <= 'f') { + return ch - 'a' + 0xa; + } else { + return -1; + } +} + +static +ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *_bufsz) +{ + size_t dst = 0, src = 0, bufsz = *_bufsz; + ssize_t ret = -2; /* incomplete */ + + while (1) { + switch (decoder->_state) { + case CHUNKED_IN_CHUNK_SIZE: + for (;; ++src) { + int v; 
+ if (src == bufsz) + goto Exit; + if ((v = decode_hex(buf[src])) == -1) { + if (decoder->_hex_count == 0) { + ret = -1; + goto Exit; + } + break; + } + if (decoder->_hex_count == sizeof(size_t) * 2) { + ret = -1; + goto Exit; + } + decoder->bytes_left_in_chunk = decoder->bytes_left_in_chunk * 16 + v; + ++decoder->_hex_count; + } + decoder->_hex_count = 0; + decoder->_state = CHUNKED_IN_CHUNK_EXT; + /* fallthru */ + case CHUNKED_IN_CHUNK_EXT: + /* RFC 7230 A.2 "Line folding in chunk extensions is disallowed" */ + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] == '\012') + break; + } + ++src; + if (decoder->bytes_left_in_chunk == 0) { + if (decoder->consume_trailer) { + decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; + break; + } else { + goto Complete; + } + } + decoder->_state = CHUNKED_IN_CHUNK_DATA; + /* fallthru */ + case CHUNKED_IN_CHUNK_DATA: { + size_t avail = bufsz - src; + if (avail < decoder->bytes_left_in_chunk) { + if (dst != src) + memmove(buf + dst, buf + src, avail); + src += avail; + dst += avail; + decoder->bytes_left_in_chunk -= avail; + goto Exit; + } + if (dst != src) + memmove(buf + dst, buf + src, decoder->bytes_left_in_chunk); + src += decoder->bytes_left_in_chunk; + dst += decoder->bytes_left_in_chunk; + decoder->bytes_left_in_chunk = 0; + decoder->_state = CHUNKED_IN_CHUNK_CRLF; + } + /* fallthru */ + case CHUNKED_IN_CHUNK_CRLF: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] != '\015') + break; + } + if (buf[src] != '\012') { + ret = -1; + goto Exit; + } + ++src; + decoder->_state = CHUNKED_IN_CHUNK_SIZE; + break; + case CHUNKED_IN_TRAILERS_LINE_HEAD: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] != '\015') + break; + } + if (buf[src++] == '\012') + goto Complete; + decoder->_state = CHUNKED_IN_TRAILERS_LINE_MIDDLE; + /* fallthru */ + case CHUNKED_IN_TRAILERS_LINE_MIDDLE: + for (;; ++src) { + if (src == bufsz) + goto Exit; + if (buf[src] == '\012') + break; + } + ++src; + decoder->_state = CHUNKED_IN_TRAILERS_LINE_HEAD; + break; + default: + assert(!"decoder is corrupt"); + } + } + +Complete: + ret = bufsz - src; +Exit: + if (dst != src) + memmove(buf + dst, buf + src, bufsz - src); + *_bufsz = dst; + return ret; +} + +static +int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder) +{ + return decoder->_state == CHUNKED_IN_CHUNK_DATA; +} + +#undef CHECK_EOF +#undef EXPECT_CHAR +#undef ADVANCE_TOKEN diff --git a/ext/unicorn_http/picohttpparser.h b/ext/unicorn_http/picohttpparser.h new file mode 100644 index 0000000..3f87b64 --- /dev/null +++ b/ext/unicorn_http/picohttpparser.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2009-2014 Kazuho Oku, Tokuhiro Matsuno, Daisuke Murase, + * Shigeo Mitsunari + * + * The software is licensed under either the MIT License (below) or the Perl + * license. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef picohttpparser_h +#define picohttpparser_h + +#include + +#ifdef _MSC_VER +#define ssize_t intptr_t +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* contains name and value of a header (name == NULL if is a continuing line + * of a multiline header */ +struct phr_header { + const char *name; + size_t name_len; + const char *value; + size_t value_len; +}; + +/* returns number of bytes consumed if successful, -2 if request is partial, + * -1 if failed */ +static +int phr_parse_request(const char *buf, size_t len, const char **method, size_t *method_len, const char **path, size_t *path_len, + int *minor_version, struct phr_header *headers, size_t *num_headers, size_t last_len); + +/* ditto */ +static +int phr_parse_response(const char *_buf, size_t len, int *minor_version, int *status, const char **msg, size_t *msg_len, + struct phr_header *headers, size_t *num_headers, size_t last_len); + +/* ditto */ +static +int phr_parse_headers(const char *buf, size_t len, struct phr_header *headers, size_t *num_headers, size_t last_len); + +/* should be zero-filled before start */ +struct phr_chunked_decoder { + size_t bytes_left_in_chunk; /* number of bytes left in current chunk */ + char consume_trailer; /* if trailing headers should be consumed */ + char _hex_count; + char _state; +}; + +/* the function rewrites the buffer given as (buf, bufsz) removing the chunked- + * encoding headers. When the function returns without an error, bufsz is + * updated to the length of the decoded data available. Applications should + * repeatedly call the function while it returns -2 (incomplete) every time + * supplying newly arrived data. If the end of the chunked-encoded data is + * found, the function returns a non-negative number indicating the number of + * octets left undecoded, that starts from the offset returned by `*bufsz`. + * Returns -1 on error. 
+ */ +static +ssize_t phr_decode_chunked(struct phr_chunked_decoder *decoder, char *buf, size_t *bufsz); + +/* returns if the chunked decoder is in middle of chunked data */ +static +int phr_decode_chunked_is_in_data(struct phr_chunked_decoder *decoder); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ext/unicorn_http/unicorn_http.rl b/ext/unicorn_http/unicorn_http.rl index ba23438..ea7a453 100644 --- a/ext/unicorn_http/unicorn_http.rl +++ b/ext/unicorn_http/unicorn_http.rl @@ -13,6 +13,8 @@ #include "global_variables.h" #include "c_util.h" #include "epollexclusive.h" +#include "picohttpparser.h" +#include "picohttpparser.c.h" void init_unicorn_httpdate(void); @@ -21,7 +23,6 @@ void init_unicorn_httpdate(void); #define UH_FL_INBODY 0x4 #define UH_FL_HASTRAILER 0x8 #define UH_FL_INTRAILER 0x10 -#define UH_FL_INCHUNK 0x20 #define UH_FL_REQEOF 0x40 #define UH_FL_KAVERSION 0x80 #define UH_FL_HASHEADER 0x100 @@ -52,15 +53,14 @@ struct http_parser { } start; union { unsigned int field_len; /* only used during header processing */ - unsigned int dest_offset; /* only used during body processing */ } s; VALUE buf; VALUE env; VALUE cont; /* Qfalse: unset, Qnil: ignored header, T_STRING: append */ union { - off_t content; - off_t chunk; - } len; + off_t clen; + struct phr_chunked_decoder pcd; + } bdy; }; static ID id_set_backtrace, id_is_chunked_p; @@ -250,12 +250,12 @@ static void write_value(struct http_parser *hp, } else if (f == g_http_connection) { hp_keepalive_connection(hp, v); } else if (f == g_content_length && !HP_FL_TEST(hp, CHUNKED)) { - if (hp->len.content) + if (hp->bdy.clen) parser_raise(eHttpParserError, "Content-Length already set"); - hp->len.content = parse_length(RSTRING_PTR(v), RSTRING_LEN(v)); - if (hp->len.content < 0) + hp->bdy.clen = parse_length(RSTRING_PTR(v), RSTRING_LEN(v)); + if (hp->bdy.clen < 0) parser_raise(eHttpParserError, "invalid Content-Length"); - if (hp->len.content != 0) + if (hp->bdy.clen != 0) HP_FL_SET(hp, HASBODY); hp_invalid_if_trailer(hp); } else if (f == g_http_transfer_encoding) { @@ -272,7 +272,7 @@ static void write_value(struct http_parser *hp, HP_FL_SET(hp, HASBODY); /* RFC 7230 3.3.3, 3: favor chunked if Content-Length exists */ - hp->len.content = 0; + hp->bdy.clen = 0; } else if (HP_FL_TEST(hp, CHUNKED)) { /* * RFC 7230 3.3.3, point 3 states: @@ -362,19 +362,12 @@ static void write_value(struct http_parser *hp, if (!STR_CSTR_EQ(val, "*")) rb_hash_aset(hp->env, g_path_info, val); } - action add_to_chunk_size { - hp->len.chunk = step_incr(hp->len.chunk, fc, 16); - if (hp->len.chunk < 0) - parser_raise(eHttpParserError, "invalid chunk size"); - } action header_done { finalize_header(hp); cs = http_parser_first_final; if (HP_FL_TEST(hp, HASBODY)) { HP_FL_SET(hp, INBODY); - if (HP_FL_TEST(hp, CHUNKED)) - cs = http_parser_en_ChunkedBody; } else { HP_FL_SET(hp, REQEOF); assert(!HP_FL_TEST(hp, CHUNKED) && "chunked encoding without body!"); @@ -385,37 +378,10 @@ static void write_value(struct http_parser *hp, */ goto post_exec; } - action end_trailers { cs = http_parser_first_final; goto post_exec; } - - action end_chunked_body { - HP_FL_SET(hp, INTRAILER); - cs = http_parser_en_Trailers; - ++p; - assert(p <= pe && "buffer overflow after chunked body"); - goto post_exec; - } - - action skip_chunk_data { - skip_chunk_data_hack: { - size_t nr = MIN((size_t)hp->len.chunk, REMAINING); - memcpy(RSTRING_PTR(hp->cont) + hp->s.dest_offset, fpc, nr); - hp->s.dest_offset += nr; - hp->len.chunk -= nr; - p += nr; - assert(hp->len.chunk >= 0 && "negative 
chunk length"); - if ((size_t)hp->len.chunk > REMAINING) { - HP_FL_SET(hp, INCHUNK); - goto post_exec; - } else { - fhold; - fgoto chunk_end; - } - }} - include unicorn_http_common "unicorn_http_common.rl"; }%% @@ -430,8 +396,8 @@ static void http_parser_init(struct http_parser *hp) hp->offset = 0; hp->start.field = 0; hp->s.field_len = 0; - hp->len.content = 0; hp->cont = Qfalse; /* zero on MRI, should be optimized away by above */ + memset(&hp->bdy.pcd, 0, sizeof(hp->bdy.pcd)); %% write init; hp->cs = cs; } @@ -454,11 +420,6 @@ http_parser_execute(struct http_parser *hp, char *buffer, size_t len) assert((void *)(pe - p) == (void *)(len - off) && "pointers aren't same distance"); - - if (HP_FL_TEST(hp, INCHUNK)) { - HP_FL_UNSET(hp, INCHUNK); - goto skip_chunk_data_hack; - } %% write exec; post_exec: /* "_out:" also goes here */ if (hp->cs != http_parser_error) @@ -676,7 +637,7 @@ static VALUE HttpParser_content_length(VALUE self) { struct http_parser *hp = data_get(self); - return HP_FL_TEST(hp, CHUNKED) ? Qnil : OFFT2NUM(hp->len.content); + return HP_FL_TEST(hp, CHUNKED) ? Qnil : OFFT2NUM(hp->bdy.clen); } /** @@ -703,8 +664,7 @@ static VALUE HttpParser_parse(VALUE self) if (hp->offset > MAX_HEADER_LEN) parser_raise(e413, "HTTP header is too large"); - if (hp->cs == http_parser_first_final || - hp->cs == http_parser_en_ChunkedBody) { + if (hp->cs == http_parser_first_final) { advance_str(data, hp->offset + 1); hp->offset = 0; if (HP_FL_TEST(hp, INTRAILER)) @@ -763,7 +723,7 @@ static VALUE HttpParser_headers(VALUE self, VALUE env, VALUE buf) static int chunked_eof(struct http_parser *hp) { - return ((hp->cs == http_parser_first_final) || HP_FL_TEST(hp, INTRAILER)); + return HP_FL_TEST(hp, INTRAILER); } /** @@ -780,7 +740,7 @@ static VALUE HttpParser_body_eof(VALUE self) if (HP_FL_TEST(hp, CHUNKED)) return chunked_eof(hp) ? Qtrue : Qfalse; - return hp->len.content == 0 ? Qtrue : Qfalse; + return hp->bdy.clen == 0 ? 
Qtrue : Qfalse; } /** @@ -853,6 +813,14 @@ static VALUE HttpParser_hijacked_bang(VALUE self) return self; } +static VALUE parse_trailers(struct http_parser *hp, VALUE src) +{ + hp->cs = http_parser_en_Trailers; + hp->buf = src; + return src; + /* TODO: switch to pico, here */ +} + /** * call-seq: * parser.filter_body(dst, src) => nil/src @@ -870,42 +838,50 @@ static VALUE HttpParser_hijacked_bang(VALUE self) static VALUE HttpParser_filter_body(VALUE self, VALUE dst, VALUE src) { struct http_parser *hp = data_get(self); - char *srcptr; - long srclen; - - srcptr = RSTRING_PTR(src); - srclen = RSTRING_LEN(src); + const char *srcptr = RSTRING_PTR(src); + long srclen = RSTRING_LEN(src); StringValue(dst); if (HP_FL_TEST(hp, CHUNKED)) { if (!chunked_eof(hp)) { + size_t bufsz = srclen; + char *dstptr; + ssize_t pret; + rb_str_modify(dst); rb_str_resize(dst, srclen); /* we can never copy more than srclen bytes */ - - hp->s.dest_offset = 0; - hp->cont = dst; + dstptr = RSTRING_PTR(dst); + memcpy(dstptr, srcptr, srclen); hp->buf = src; - http_parser_execute(hp, srcptr, srclen); - if (hp->cs == http_parser_error) - parser_raise(eHttpParserError, "Invalid HTTP format, parsing fails."); - - assert(hp->s.dest_offset <= hp->offset && - "destination buffer overflow"); - advance_str(src, hp->offset); - rb_str_set_len(dst, hp->s.dest_offset); - - if (RSTRING_LEN(dst) == 0 && chunked_eof(hp)) { - assert(hp->len.chunk == 0 && "chunk at EOF but more to parse"); + pret = phr_decode_chunked(&hp->bdy.pcd, dstptr, &bufsz); + if (pret >= 0) { + rb_str_modify(src); + if (pret) + memcpy(RSTRING_PTR(src), dstptr + bufsz, pret); + rb_str_set_len(src, (long)pret); + rb_str_set_len(dst, (long)bufsz); + HP_FL_SET(hp, INTRAILER); } else { - src = Qnil; + switch (pret) { + case -2: /* incomplete */ + rb_str_set_len(dst, (long)bufsz); + rb_str_set_len(src, 0); + return Qnil; + case -1: + parser_raise(eHttpParserError, "Invalid HTTP format, parsing fails."); + default: + assert(pret >= 0 && "phr_decode_chunked returned < -2"); + } } } + assert(HP_FL_TEST(hp, INTRAILER) && "INTRAILER not set"); + return parse_trailers(hp, src); } else { /* no need to enter the Ragel machine for unchunked transfers */ - assert(hp->len.content >= 0 && "negative Content-Length"); - if (hp->len.content > 0) { - long nr = MIN(srclen, hp->len.content); + assert(hp->bdy.clen >= 0 && "negative Content-Length"); + if (hp->bdy.clen > 0) { + long nr = MIN(srclen, hp->bdy.clen); rb_str_modify(dst); rb_str_resize(dst, nr); @@ -918,8 +894,8 @@ static VALUE HttpParser_filter_body(VALUE self, VALUE dst, VALUE src) */ hp->buf = src; memcpy(RSTRING_PTR(dst), srcptr, nr); - hp->len.content -= nr; - if (hp->len.content == 0) { + hp->bdy.clen -= nr; + if (hp->bdy.clen == 0) { HP_FL_SET(hp, REQEOF); hp->cs = http_parser_first_final; } @@ -995,7 +971,7 @@ void Init_unicorn_http(void) * it is highly unlikely to encounter clients that send more than * several kilobytes at once. */ - rb_define_const(cHttpParser, "CHUNK_MAX", OFFT2NUM(UH_OFF_T_MAX)); + rb_define_const(cHttpParser, "CHUNK_MAX", SIZET2NUM(SIZE_MAX)); /* * The maximum size of the body as specified by Content-Length. 
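The rewritten chunked branch of HttpParser_filter_body above is the heart of this change: instead of walking the Ragel chunk machine and copying at a destination offset, it copies src into dst once, lets phr_decode_chunked strip the chunk framing from dst in place, and hands whatever the decoder leaves undecoded back to src as the start of the trailer section. A minimal C sketch of that flow follows; the function and parameter names (filter_chunked_sketch, decoded_len, trailer_len) are illustrative only and not part of unicorn's API:

#include <string.h>
#include <sys/types.h>          /* ssize_t */
#include "picohttpparser.c.h"   /* vendored copy: everything is static */

/*
 * Simplified sketch of the new chunked filter_body flow.  unicorn does
 * this inside HttpParser_filter_body with Ruby strings; dst must hold at
 * least srclen bytes.
 */
static ssize_t filter_chunked_sketch(struct phr_chunked_decoder *pcd,
                                     char *dst, const char *src, size_t srclen,
                                     size_t *decoded_len, size_t *trailer_len)
{
    size_t bufsz = srclen;
    ssize_t pret;

    memcpy(dst, src, srclen);        /* decoding happens in the dst copy */
    pret = phr_decode_chunked(pcd, dst, &bufsz);

    *decoded_len = bufsz;            /* body bytes decoded so far */
    if (pret >= 0) {                 /* terminal chunk reached */
        *trailer_len = (size_t)pret; /* raw trailer bytes now at dst + bufsz */
        return pret;                 /* caller parses trailers next */
    }
    *trailer_len = 0;
    return pret;     /* -2: feed more data next call; -1: bad chunk framing */
}

With consume_trailer left at zero (the decoder struct is zero-filled in http_parser_init), the decoder stops right after the terminal chunk-size line, which is why a non-negative return value doubles as the length of the raw trailer bytes handed back for the Trailers machine.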
diff --git a/ext/unicorn_http/unicorn_http_common.rl b/ext/unicorn_http/unicorn_http_common.rl index 0988b54..7570433 100644 --- a/ext/unicorn_http/unicorn_http_common.rl +++ b/ext/unicorn_http/unicorn_http_common.rl @@ -56,16 +56,6 @@ value_cont = lws+ content* >start_value %write_cont_value; message_header = ((field_name ":" lws* field_value)|value_cont) :> CRLF; - chunk_ext_val = token*; - chunk_ext_name = token*; - chunk_extension = ( ";" " "* chunk_ext_name ("=" chunk_ext_val)? )*; - last_chunk = "0"+ chunk_extension CRLF; - chunk_size = (xdigit* [1-9a-fA-F] xdigit*) $add_to_chunk_size; - chunk_end = CRLF; - chunk_body = any >skip_chunk_data; - chunk_begin = chunk_size chunk_extension CRLF; - chunk = chunk_begin chunk_body chunk_end; - ChunkedBody := chunk* last_chunk @end_chunked_body; Trailers := (message_header)* CRLF @end_trailers; FullRequest = Request_Line (message_header)* CRLF @header_done; diff --git a/test/unit/test_http_parser.rb b/test/unit/test_http_parser.rb index 697af44..68d48b8 100644 --- a/test/unit/test_http_parser.rb +++ b/test/unit/test_http_parser.rb @@ -859,7 +859,7 @@ class HttpParserTest < Test::Unit::TestCase # need to update this when 128-bit machines come out # n.b. actual struct size on 64-bit is 56 bytes + 40 bytes for RVALUE # Ruby <= 2.2 objspace did not count the 40-byte RVALUE, 2.3 does. - assert_operator n, :<=, 96 + assert_operator n, :<=, 104 # TODO: drop to <= 96 assert_operator n, :>, 0 end rescue LoadError diff --git a/test/unit/test_http_parser_ng.rb b/test/unit/test_http_parser_ng.rb index 425d5ad..40fe2e3 100644 --- a/test/unit/test_http_parser_ng.rb +++ b/test/unit/test_http_parser_ng.rb @@ -230,8 +230,10 @@ class HttpParserNgTest < Test::Unit::TestCase tmp = "" assert_nil @parser.filter_body(tmp, str << "..") assert_equal "..", tmp - assert_nil @parser.filter_body(tmp, str << "abcd\r\n0\r\n") + assert_nil @parser.filter_body(tmp, str << "abcd") assert_equal "abcd", tmp + @parser.filter_body(tmp, str << "\r\n0\r\n") + assert_equal "", tmp assert_equal str.object_id, @parser.filter_body(tmp, str << "PUT").object_id assert_equal "PUT", str assert ! @parser.keepalive? 
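The test adjustment above, where filter_body now needs a second call before the body is considered complete, follows directly from phr_decode_chunked's incremental contract: it returns -2 until the terminating 0-size chunk has been seen, and the same zero-filled phr_chunked_decoder must be carried across calls. Below is a standalone sketch of that call pattern, loosely mirroring the data the test feeds; it is illustrative only, not part of the test suite, and assumes a complete vendored picohttpparser copy on the include path:

#include <assert.h>
#include <string.h>
#include <sys/types.h>
#include "picohttpparser.c.h"

/* Illustrative only: feed a chunked body to phr_decode_chunked in two
 * pieces and observe -2 ("incomplete, call again") followed by a
 * non-negative result once the 0-size chunk shows up.  The decoder state
 * must be zero-filled and carried across calls. */
int main(void)
{
    struct phr_chunked_decoder dec = { 0 };
    char buf[64];
    size_t sz;
    ssize_t ret;

    /* first piece: chunk size line plus data, but no chunk terminator yet */
    sz = sizeof("4\r\nabcd") - 1;
    memcpy(buf, "4\r\nabcd", sz);
    ret = phr_decode_chunked(&dec, buf, &sz);
    assert(ret == -2);                          /* need more input */
    assert(sz == 4 && !memcmp(buf, "abcd", 4)); /* decoded so far */

    /* second piece: the chunk's CRLF plus the terminating 0-size chunk */
    sz = sizeof("\r\n0\r\n\r\n") - 1;
    memcpy(buf, "\r\n0\r\n\r\n", sz);
    ret = phr_decode_chunked(&dec, buf, &sz);
    assert(ret == 2 && !memcmp(buf, "\r\n", 2)); /* leftover trailer bytes */
    assert(sz == 0);                             /* no body bytes here */
    return 0;
}

The final return value of 2 is the undecoded CRLF terminating the (empty) trailer section, which unicorn's Trailers machine consumes afterwards.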
@@ -318,7 +320,7 @@ class HttpParserNgTest < Test::Unit::TestCase "1\r\na\r\n2\r\n..\r\n0\r\n" assert_equal req, @parser.parse tmp = '' - assert_nil @parser.filter_body(tmp, str) + @parser.filter_body(tmp, str) assert_equal 'a..', tmp rv = @parser.filter_body(tmp, str) assert_equal rv.object_id, str.object_id @@ -357,7 +359,8 @@ class HttpParserNgTest < Test::Unit::TestCase assert_equal 'Content-MD5', req['HTTP_TRAILER'] assert_nil req['HTTP_CONTENT_MD5'] tmp = '' - assert_nil @parser.filter_body(tmp, str) + # assert_nil @parser.filter_body(tmp, str) + @parser.filter_body(tmp, str) assert_equal 'a..', tmp md5_b64 = [ Digest::MD5.digest(tmp) ].pack('m').strip.freeze rv = @parser.filter_body(tmp, str) @@ -387,7 +390,7 @@ class HttpParserNgTest < Test::Unit::TestCase assert_equal 'Content-MD5', req['HTTP_TRAILER'] assert_nil req['HTTP_CONTENT_MD5'] tmp = '' - assert_nil @parser.filter_body(tmp, str) + @parser.filter_body(tmp, str) assert_equal 'a..', tmp md5_b64 = [ Digest::MD5.digest(tmp) ].pack('m').strip.freeze rv = @parser.filter_body(tmp, str) @@ -471,7 +474,7 @@ class HttpParserNgTest < Test::Unit::TestCase assert_equal req, @parser.parse assert_equal 'Transfer-Encoding', req['HTTP_TRAILER'] tmp = '' - assert_nil @parser.filter_body(tmp, str) + @parser.filter_body(tmp, str) assert_equal 'a..', tmp assert_equal '', str str << "Transfer-Encoding: identity\r\n\r\n" -- cgit v1.2.3-24-ge0c7
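The new parse_trailers helper in the unicorn_http.rl hunk still re-enters the Ragel Trailers machine and carries a "TODO: switch to pico, here" note. If that follow-up lands, the raw trailer bytes phr_decode_chunked hands back could presumably be fed to phr_parse_headers from the same vendored copy. A hedged sketch of what that might look like; nothing below is in this patch, and parse_trailer_section is a made-up helper:

#include <stddef.h>
#include <stdio.h>
#include "picohttpparser.c.h"

/* Hypothetical follow-up to the "switch to pico" TODO in parse_trailers:
 * hand the raw trailer bytes left behind by phr_decode_chunked to
 * phr_parse_headers.  Illustration only, not part of the patch. */
static int parse_trailer_section(const char *buf, size_t len)
{
    struct phr_header trailers[8];
    size_t num_trailers = sizeof(trailers) / sizeof(trailers[0]);
    int consumed = phr_parse_headers(buf, len, trailers, &num_trailers, 0);

    if (consumed < 0)
        return consumed;        /* -1: parse error, -2: need more data */

    for (size_t i = 0; i < num_trailers; i++)
        printf("trailer: %.*s: %.*s\n",
               (int)trailers[i].name_len, trailers[i].name,
               (int)trailers[i].value_len, trailers[i].value);
    return consumed;            /* bytes consumed, including the final CRLF */
}

For now the Ragel machine keeps doing this work, which also takes care of validating allowed trailers and storing them into the Rack env.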