Add HTML

2023-12-11 19:48:31 +01:00 · 2023-12-11 19:48:31 +01:00 · 3088bcfa22
commit 3088bcfa22
parent bb561d2e33
11 changed files with 1434 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -7,3 +7,4 @@ Languages:
 * nix: https://github.com/nix-community/tree-sitter-nix (MIT)
 * python: https://github.com/tree-sitter/tree-sitter-python (MIT)
 * rust: https://github.com/tree-sitter/tree-sitter-rust (MIT)
+* html: https://github.com/tree-sitter/tree-sitter-html (MIT)
--- a/flake.nix
+++ b/flake.nix
@ -44,6 +44,7 @@
        packages.nix = compile-tree-sitter { src = ./nix; name = "nix"; };
        packages.python = compile-tree-sitter { src = ./python; name = "python"; };
        packages.rust = compile-tree-sitter { src = ./rust; name = "rust"; };
+        packages.html = compile-tree-sitter { src = ./html; name = "html"; };
      }
    );
 }
--- a/html/LICENSE
+++ b/html/LICENSE
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Max Brunsfeld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/html/grammar.js
+++ b/html/grammar.js
@ -0,0 +1,143 @@
+/**
+ * @file HTML grammar for tree-sitter
+ * @author Max Brunsfeld
+ * @license MIT
+ */
+
+/* eslint-disable arrow-parens */
+/* eslint-disable camelcase */
+/* eslint-disable-next-line spaced-comment */
+/// <reference types="tree-sitter-cli/dsl" />
+// @ts-check
+
+// Tree-sitter grammar for HTML. Tag names, implicit end tags, raw text and
+// comments are produced by the external scanner in src/scanner.c; everything
+// else is described by the rules below.
+module.exports = grammar({
+  name: 'html',
+
+  // Comments and runs of whitespace may appear between any two tokens.
+  extras: $ => [
+    $.comment,
+    /\s+/,
+  ],
+
+  // Tokens emitted by the external scanner. They are listed in the same order
+  // as the TokenType enum in src/scanner.c.
+  externals: $ => [
+    $._start_tag_name,
+    $._script_start_tag_name,
+    $._style_start_tag_name,
+    $._end_tag_name,
+    $.erroneous_end_tag_name,
+    '/>',
+    $._implicit_end_tag,
+    $.raw_text,
+    $.comment,
+  ],
+
+  rules: {
+    // Root rule: a document is any sequence of nodes.
+    fragment: $ => repeat($._node),
+
+    // `<!doctype ...>`; the keyword is matched case-insensitively.
+    doctype: $ => seq(
+      '<!',
+      alias($._doctype, 'doctype'),
+      /[^>]+/,
+      '>',
+    ),
+
+    _doctype: _ => /[Dd][Oo][Cc][Tt][Yy][Pp][Ee]/,
+
+    _node: $ => choice(
+      $.doctype,
+      $.entity,
+      $.text,
+      $.element,
+      $.script_element,
+      $.style_element,
+      $.erroneous_end_tag,
+    ),
+
+    // Either a start tag, children and an explicit or implicit end tag, or a
+    // single self-closing tag.
+    element: $ => choice(
+      seq(
+        $.start_tag,
+        repeat($._node),
+        choice($.end_tag, $._implicit_end_tag),
+      ),
+      $.self_closing_tag,
+    ),
+
+    // The body of a script element is a single optional raw_text token rather
+    // than nested nodes.
+    script_element: $ => seq(
+      alias($.script_start_tag, $.start_tag),
+      optional($.raw_text),
+      $.end_tag,
+    ),
+
+    // Same shape as script_element, for style elements.
+    style_element: $ => seq(
+      alias($.style_start_tag, $.start_tag),
+      optional($.raw_text),
+      $.end_tag,
+    ),
+
+    start_tag: $ => seq(
+      '<',
+      alias($._start_tag_name, $.tag_name),
+      repeat($.attribute),
+      '>',
+    ),
+
+    script_start_tag: $ => seq(
+      '<',
+      alias($._script_start_tag_name, $.tag_name),
+      repeat($.attribute),
+      '>',
+    ),
+
+    style_start_tag: $ => seq(
+      '<',
+      alias($._style_start_tag_name, $.tag_name),
+      repeat($.attribute),
+      '>',
+    ),
+
+    self_closing_tag: $ => seq(
+      '<',
+      alias($._start_tag_name, $.tag_name),
+      repeat($.attribute),
+      '/>',
+    ),
+
+    end_tag: $ => seq(
+      '</',
+      alias($._end_tag_name, $.tag_name),
+      '>',
+    ),
+
+    // An end tag whose name the scanner reported as erroneous_end_tag_name.
+    erroneous_end_tag: $ => seq(
+      '</',
+      $.erroneous_end_tag_name,
+      '>',
+    ),
+
+    // A name, optionally followed by `=` and a bare or quoted value.
+    attribute: $ => seq(
+      $.attribute_name,
+      optional(seq(
+        '=',
+        choice(
+          $.attribute_value,
+          $.quoted_attribute_value,
+        ),
+      )),
+    ),
+
+    attribute_name: _ => /[^<>"'/=\s]+/,
+
+    attribute_value: _ => /[^<>"'=\s]+/,
+
+    // An entity can be named, numeric (decimal), or numeric (hexadecimal). The
+    // longest entity name is 29 characters long, and the HTML spec says that
+    // no more will ever be added.
+    entity: _ => /&(#([xX][0-9a-fA-F]{1,6}|[0-9]{1,5})|[A-Za-z]{1,30});/,
+
+    // Single- or double-quoted value; the text between the quotes (if any) is
+    // exposed as an attribute_value node.
+    quoted_attribute_value: $ => choice(
+      seq('\'', optional(alias(/[^']+/, $.attribute_value)), '\''),
+      seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"'),
+    ),
+
+    // Text neither starts nor ends with whitespace; interior whitespace is
+    // part of the token.
+    text: _ => /[^<>&\s]([^<>&]*[^<>&\s])?/,
+  },
+});
--- a/html/src/scanner.c
+++ b/html/src/scanner.c
@ -0,0 +1,452 @@
+#include "tag.h"
+
+#include <wctype.h>
+
+// Tokens this external scanner can produce. The enumerators appear in the
+// same order as the `externals` list in grammar.js.
+enum TokenType {
+    START_TAG_NAME,
+    SCRIPT_START_TAG_NAME,
+    STYLE_START_TAG_NAME,
+    END_TAG_NAME,
+    ERRONEOUS_END_TAG_NAME,
+    SELF_CLOSING_TAG_DELIMITER,  // the "/>" token
+    IMPLICIT_END_TAG,
+    RAW_TEXT,
+    COMMENT
+};
+
+// Growable array of Tag values, manipulated through the VEC_* macros below.
+typedef struct {
+    uint32_t len;  // number of tags currently stored
+    uint32_t cap;  // number of tags `data` has room for
+    Tag *data;     // heap storage; NULL until the first resize
+} tags_vec;
+
+// Scanner state: the stack of currently open tags. Start tags push onto it
+// and end / implicit-end / self-closing tokens pop from it.
+typedef struct {
+    tags_vec tags;
+} Scanner;
+
+// Larger of two values. Each argument is evaluated more than once, so do not
+// pass expressions with side effects.
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// Grow `vec` so it can hold `_cap` elements. Does nothing when `_cap` is zero
+// or not larger than the current capacity (never shrinks). Aborts via assert
+// if realloc fails. Newly added slots are left uninitialised.
+#define VEC_RESIZE(vec, _cap)                                                  \
+    if ((_cap) > (vec).cap && (_cap) > 0) {                                    \
+        void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0]));       \
+        assert(tmp != NULL);                                                   \
+        (vec).data = tmp;                                                      \
+        (vec).cap = (_cap);                                                    \
+    }
+
+// Ensure capacity of at least `_cap` elements.
+#define VEC_GROW(vec, _cap)                                                    \
+    if ((vec).cap < (_cap)) {                                                  \
+        VEC_RESIZE((vec), (_cap));                                             \
+    }
+
+// Append `el`, doubling the capacity (minimum 16) when the vector is full.
+// NOTE: expands to two statements, so it must not be used as the body of an
+// unbraced `if`/`for`/`while`.
+#define VEC_PUSH(vec, el)                                                      \
+    if ((vec).cap == (vec).len) {                                              \
+        VEC_RESIZE((vec), MAX(16, (vec).len * 2));                             \
+    }                                                                          \
+    (vec).data[(vec).len++] = (el);
+
+// Remove the last tag, releasing its custom name if it has one. Does not
+// check for an empty vector; callers must ensure len > 0.
+#define VEC_POP(vec)                                                           \
+    {                                                                          \
+        if (VEC_BACK(vec).type == CUSTOM) {                                    \
+            tag_free(&VEC_BACK(vec));                                          \
+        }                                                                      \
+        (vec).len--;                                                           \
+    }
+
+// The last element. Only meaningful when len > 0.
+#define VEC_BACK(vec) ((vec).data[(vec).len - 1])
+
+// Release the element storage. Does not free per-tag strings and leaves
+// len/cap untouched.
+#define VEC_FREE(vec)                                                          \
+    {                                                                          \
+        if ((vec).data != NULL)                                                \
+            free((vec).data);                                                  \
+        (vec).data = NULL;                                                     \
+    }
+
+// Free every stored tag's custom name and reset the length to zero; the
+// element storage and capacity are kept for reuse.
+#define VEC_CLEAR(vec)                                                         \
+    {                                                                          \
+        for (int i = 0; i < (vec).len; i++) {                                  \
+            tag_free(&(vec).data[i]);                                          \
+        }                                                                      \
+        (vec).len = 0;                                                         \
+    }
+
+// Reallocate a String to hold `_cap` characters plus a terminator, zeroing
+// everything from the current length onwards so the buffer stays
+// NUL-terminated. Aborts via assert if realloc fails.
+// NOTE: declares a local `tmp` without opening a scope of its own, so it must
+// only be expanded inside a braced block.
+#define STRING_RESIZE(vec, _cap)                                               \
+    void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0]));     \
+    assert(tmp != NULL);                                                       \
+    (vec).data = tmp;                                                          \
+    memset((vec).data + (vec).len, 0,                                          \
+           (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));                \
+    (vec).cap = (_cap);
+
+// Ensure capacity of at least `_cap` characters.
+#define STRING_GROW(vec, _cap)                                                 \
+    if ((vec).cap < (_cap)) {                                                  \
+        STRING_RESIZE((vec), (_cap));                                          \
+    }
+
+// Append one character, doubling the capacity (minimum 16) when full.
+// NOTE: expands to two statements; do not use as an unbraced statement body.
+#define STRING_PUSH(vec, el)                                                   \
+    if ((vec).cap == (vec).len) {                                              \
+        STRING_RESIZE((vec), MAX(16, (vec).len * 2));                          \
+    }                                                                          \
+    (vec).data[(vec).len++] = (el);
+
+// Initialise an empty String with room for 16 characters plus a terminator.
+// The calloc result is not checked here.
+#define STRING_INIT(vec)                                                       \
+    {                                                                          \
+        (vec).data = calloc(1, sizeof(char) * 17);                             \
+        (vec).len = 0;                                                         \
+        (vec).cap = 16;                                                        \
+    }
+
+// Release the character buffer and null the pointer; len/cap are untouched.
+#define STRING_FREE(vec)                                                       \
+    {                                                                          \
+        if ((vec).data != NULL)                                                \
+            free((vec).data);                                                  \
+        (vec).data = NULL;                                                     \
+    }
+
+// Reset to the empty string, keeping the allocation.
+#define STRING_CLEAR(vec)                                                      \
+    {                                                                          \
+        (vec).len = 0;                                                         \
+        memset((vec).data, 0, (vec).cap * sizeof(char));                       \
+    }
+
+// Write the open-tag stack into `buffer` and return the number of bytes used.
+//
+// Layout: a uint16 count of the tag records actually written, a uint16 count
+// of the tags on the stack (capped at UINT16_MAX), then one record per tag.
+// A record is a single type byte; CUSTOM tags add a length byte and up to
+// UINT8_MAX name bytes. Writing stops early once the next record would reach
+// TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
+static unsigned serialize(Scanner *scanner, char *buffer) {
+    uint16_t total = UINT16_MAX;
+    if (scanner->tags.len <= UINT16_MAX) {
+        total = (uint16_t)scanner->tags.len;
+    }
+    uint16_t written = 0;
+
+    // The first slot is reserved for `written`, which is only known at the
+    // end; the total goes in the slot right after it.
+    unsigned offset = sizeof(written);
+    memcpy(buffer + offset, &total, sizeof(total));
+    offset += sizeof(total);
+
+    while (written < total) {
+        const Tag *tag = &scanner->tags.data[written];
+
+        if (tag->type != CUSTOM) {
+            if (offset + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+                break;
+            }
+            buffer[offset++] = (char)tag->type;
+            written++;
+            continue;
+        }
+
+        // The name length has to fit into a single byte.
+        unsigned name_len = tag->custom_tag_name.len;
+        if (name_len > UINT8_MAX) {
+            name_len = UINT8_MAX;
+        }
+        if (offset + 2 + name_len >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+            break;
+        }
+        buffer[offset++] = (char)tag->type;
+        buffer[offset++] = (char)name_len;
+        strncpy(buffer + offset, tag->custom_tag_name.data, name_len);
+        offset += name_len;
+        written++;
+    }
+
+    memcpy(buffer, &written, sizeof(written));
+    return offset;
+}
+
+// Rebuild the open-tag stack from a buffer previously filled by serialize().
+//
+// The buffer starts with two uint16 values: the number of tag records that
+// were written and the number of tags that were on the stack. When the
+// second is larger (the serializer ran out of room) the missing tags are
+// replaced by placeholder tags so the stack depth is preserved.
+// A zero `length` simply leaves the stack empty.
+static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
+    VEC_CLEAR(scanner->tags);
+    if (length > 0) {
+        unsigned size = 0;
+        uint16_t tag_count = 0;
+        uint16_t serialized_tag_count = 0;
+
+        memcpy(&serialized_tag_count, &buffer[size],
+               sizeof(serialized_tag_count));
+        size += sizeof(serialized_tag_count);
+
+        memcpy(&tag_count, &buffer[size], sizeof(tag_count));
+        size += sizeof(tag_count);
+
+        VEC_RESIZE(scanner->tags, tag_count);
+        if (tag_count > 0) {
+            unsigned iter = 0;
+            for (iter = 0; iter < serialized_tag_count; iter++) {
+                // Start from a fully initialised tag. The vector slots are
+                // uninitialised (or stale) memory, and a garbage
+                // custom_tag_name pointer on a non-CUSTOM tag would later be
+                // handed to free().
+                Tag tag = new_tag();
+                tag.type = (TagType)buffer[size++];
+                if (tag.type == CUSTOM) {
+                    uint16_t name_length = (uint8_t)buffer[size++];
+                    tag.custom_tag_name.len = name_length;
+                    tag.custom_tag_name.cap = name_length;
+                    tag.custom_tag_name.data =
+                        (char *)calloc(1, sizeof(char) * (name_length + 1));
+                    assert(tag.custom_tag_name.data != NULL);
+                    strncpy(tag.custom_tag_name.data, &buffer[size],
+                            name_length);
+                    size += name_length;
+                }
+                VEC_PUSH(scanner->tags, tag);
+            }
+            // add zero tags if we didn't read enough, this is because the
+            // buffer had no more room but we held more tags.
+            for (; iter < tag_count; iter++) {
+                Tag tag = new_tag();
+                VEC_PUSH(scanner->tags, tag);
+            }
+        }
+    }
+}
+
+// Consume a tag name (alphanumerics, '-' and ':') from the lexer and return
+// it upper-cased. The result is empty when the lookahead is not a name
+// character. The caller owns the returned String and must STRING_FREE it.
+static String scan_tag_name(TSLexer *lexer) {
+    String name;
+    STRING_INIT(name);
+    for (;;) {
+        bool is_name_char = iswalnum(lexer->lookahead) ||
+                            lexer->lookahead == '-' ||
+                            lexer->lookahead == ':';
+        if (!is_name_char) {
+            break;
+        }
+        STRING_PUSH(name, towupper(lexer->lookahead));
+        lexer->advance(lexer, false);
+    }
+    return name;
+}
+
+// Scan the remainder of a comment. scan() calls this after it has consumed
+// "<!", so the next two characters must be "--". The comment ends at the
+// first '>' that directly follows at least two consecutive dashes.
+// Returns true with result_symbol = COMMENT on success, false if the opening
+// dashes are missing or the input ends before the terminator.
+static bool scan_comment(TSLexer *lexer) {
+    if (lexer->lookahead != '-') {
+        return false;
+    }
+    lexer->advance(lexer, false);
+    if (lexer->lookahead != '-') {
+        return false;
+    }
+    lexer->advance(lexer, false);
+
+    // Number of consecutive '-' characters seen immediately before the
+    // current lookahead.
+    unsigned dashes = 0;
+    while (lexer->lookahead) {
+        switch (lexer->lookahead) {
+            case '-':
+                ++dashes;
+                break;
+            case '>':
+                if (dashes >= 2) {
+                    lexer->result_symbol = COMMENT;
+                    lexer->advance(lexer, false);
+                    lexer->mark_end(lexer);
+                    return true;
+                }
+                // fall through: a '>' without "--" before it is ordinary
+                // comment content and resets the dash run.
+            default:
+                dashes = 0;
+        }
+        lexer->advance(lexer, false);
+    }
+    return false;
+}
+
+// Scan the raw text content of a <script> or <style> element.
+//
+// The token extends up to, but does not include, the case-insensitive
+// closing delimiter ("</SCRIPT" if the innermost open tag is SCRIPT,
+// otherwise "</STYLE"), or to the end of input if no delimiter is found.
+// Returns false only when no tag is open; otherwise sets result_symbol to
+// RAW_TEXT and returns true (the token may be empty).
+static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
+    if (scanner->tags.len == 0) {
+        return false;
+    }
+
+    lexer->mark_end(lexer);
+
+    const char *end_delimiter =
+        VEC_BACK(scanner->tags).type == SCRIPT ? "</SCRIPT" : "</STYLE";
+    const size_t delimiter_length = strlen(end_delimiter);
+
+    unsigned delimiter_index = 0;
+    while (lexer->lookahead) {
+        if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
+            delimiter_index++;
+            if (delimiter_index == delimiter_length) {
+                break;
+            }
+            lexer->advance(lexer, false);
+        } else if (delimiter_index > 0) {
+            // A partial match just failed. Everything consumed so far is raw
+            // text, but the current character must be re-tested from the
+            // start of the delimiter: it may itself be the '<' that opens the
+            // real end tag (e.g. "<</script>").
+            delimiter_index = 0;
+            lexer->mark_end(lexer);
+        } else {
+            lexer->advance(lexer, false);
+            lexer->mark_end(lexer);
+        }
+    }
+
+    lexer->result_symbol = RAW_TEXT;
+    return true;
+}
+
+// Decide whether the innermost open element must be closed implicitly.
+//
+// scan() calls this either right after consuming '<' (having marked the
+// token end before it, so nothing consumed here becomes part of the token)
+// or when the lookahead is 0. On success exactly one tag is popped from the
+// stack, result_symbol is set to IMPLICIT_END_TAG and true is returned.
+static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
+    // Innermost open tag, or NULL when nothing is open.
+    Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags);
+
+    bool is_closing_tag = false;
+    if (lexer->lookahead == '/') {
+        is_closing_tag = true;
+        lexer->advance(lexer, false);
+    } else {
+        // Anything other than an end tag after a void element closes it.
+        if (parent && is_void(parent)) {
+            VEC_POP(scanner->tags);
+            lexer->result_symbol = IMPLICIT_END_TAG;
+            return true;
+        }
+    }
+
+    String tag_name = scan_tag_name(lexer);
+    if (tag_name.len == 0) {
+        STRING_FREE(tag_name);
+        return false;
+    }
+
+    // Temporary tag describing the upcoming start/end tag; freed on every
+    // return path below.
+    Tag next_tag = for_name(tag_name.data);
+
+    if (is_closing_tag) {
+        // The tag correctly closes the topmost element on the stack
+        if (scanner->tags.len > 0 &&
+            tagcmp(&VEC_BACK(scanner->tags), &next_tag)) {
+            STRING_FREE(tag_name);
+            tag_free(&next_tag);
+            return false;
+        }
+
+        // Otherwise, dig deeper and queue implicit end tags (to be nice in
+        // the case of malformed HTML)
+        // Only the tag type is compared here, and only the topmost tag is
+        // popped per call.
+        for (unsigned i = scanner->tags.len; i > 0; i--) {
+            if (scanner->tags.data[i - 1].type == next_tag.type) {
+                VEC_POP(scanner->tags);
+                lexer->result_symbol = IMPLICIT_END_TAG;
+                STRING_FREE(tag_name);
+                tag_free(&next_tag);
+                return true;
+            }
+        }
+    } else if (parent && !can_contain(parent, &next_tag)) {
+        // A start tag the current parent may not contain closes the parent.
+        VEC_POP(scanner->tags);
+        lexer->result_symbol = IMPLICIT_END_TAG;
+        STRING_FREE(tag_name);
+        tag_free(&next_tag);
+        return true;
+    }
+
+    STRING_FREE(tag_name);
+    tag_free(&next_tag);
+    return false;
+}
+
+// Scan the name of a start tag, push the tag onto the open-tag stack and
+// report which kind of start-tag-name token was seen (script, style or any
+// other). Returns false without touching the stack when no name is present.
+static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
+    String name = scan_tag_name(lexer);
+    if (name.len == 0) {
+        STRING_FREE(name);
+        return false;
+    }
+
+    // for_name() keeps its own copy of a custom name, so the scratch string
+    // can be released as soon as the tag exists.
+    Tag opened = for_name(name.data);
+    STRING_FREE(name);
+
+    VEC_PUSH(scanner->tags, opened);
+
+    if (opened.type == SCRIPT) {
+        lexer->result_symbol = SCRIPT_START_TAG_NAME;
+    } else if (opened.type == STYLE) {
+        lexer->result_symbol = STYLE_START_TAG_NAME;
+    } else {
+        lexer->result_symbol = START_TAG_NAME;
+    }
+    return true;
+}
+
+// Scan the name of an end tag. When it matches the innermost open tag that
+// tag is popped and END_TAG_NAME is reported; any other name is reported as
+// ERRONEOUS_END_TAG_NAME and the stack is left alone. Returns false when no
+// name is present.
+static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
+    String name = scan_tag_name(lexer);
+    if (name.len == 0) {
+        STRING_FREE(name);
+        return false;
+    }
+
+    Tag closing = for_name(name.data);
+    bool closes_top = scanner->tags.len > 0 &&
+                      tagcmp(&VEC_BACK(scanner->tags), &closing);
+    if (closes_top) {
+        VEC_POP(scanner->tags);
+    }
+    lexer->result_symbol = closes_top ? END_TAG_NAME : ERRONEOUS_END_TAG_NAME;
+
+    tag_free(&closing);
+    STRING_FREE(name);
+    return true;
+}
+
+// Scan the "/>" that ends a self-closing tag. scan() calls this with the
+// lookahead on '/'. On success the innermost open tag (if any) is popped,
+// result_symbol is set to SELF_CLOSING_TAG_DELIMITER and true is returned;
+// false means the '/' was not followed by '>'.
+static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
+    lexer->advance(lexer, false);
+    if (lexer->lookahead != '>') {
+        return false;
+    }
+    lexer->advance(lexer, false);
+
+    if (scanner->tags.len > 0) {
+        VEC_POP(scanner->tags);
+    }
+    // Always name the token being returned. Previously an empty tag stack
+    // made this function return true without setting result_symbol, so the
+    // "/>" was reported under whatever symbol the lexer happened to hold.
+    lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
+    return true;
+}
+
+// Scanner entry point: choose which external token to try, based on the set
+// of symbols the parser currently accepts and on the next character.
+// Returns true when a token was recognised (result_symbol says which).
+static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
+    // Raw text is valid but tag names are not: scan raw text before any
+    // whitespace is skipped.
+    if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] &&
+        !valid_symbols[END_TAG_NAME]) {
+        return scan_raw_text(scanner, lexer);
+    }
+
+    // Skip leading whitespace without making it part of the token.
+    while (iswspace(lexer->lookahead)) {
+        lexer->advance(lexer, true);
+    }
+
+    switch (lexer->lookahead) {
+        case '<':
+            // Mark the token end before '<': an implicit end tag consumes
+            // nothing, while scan_comment() moves the mark itself on success.
+            lexer->mark_end(lexer);
+            lexer->advance(lexer, false);
+
+            if (lexer->lookahead == '!') {
+                lexer->advance(lexer, false);
+                return scan_comment(lexer);
+            }
+
+            if (valid_symbols[IMPLICIT_END_TAG]) {
+                return scan_implicit_end_tag(scanner, lexer);
+            }
+            break;
+
+        // A lookahead of 0 (presumably end of input) may still close an open
+        // element implicitly.
+        case '\0':
+            if (valid_symbols[IMPLICIT_END_TAG]) {
+                return scan_implicit_end_tag(scanner, lexer);
+            }
+            break;
+
+        case '/':
+            if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
+                return scan_self_closing_tag_delimiter(scanner, lexer);
+            }
+            break;
+
+        default:
+            // Tag names; when both kinds are acceptable a start tag name
+            // takes precedence.
+            if ((valid_symbols[START_TAG_NAME] ||
+                 valid_symbols[END_TAG_NAME]) &&
+                !valid_symbols[RAW_TEXT]) {
+                return valid_symbols[START_TAG_NAME]
+                           ? scan_start_tag_name(scanner, lexer)
+                           : scan_end_tag_name(scanner, lexer);
+            }
+    }
+
+    return false;
+}
+
+// Allocate a scanner with an empty open-tag stack (the state is
+// zero-initialised). Returns NULL if the allocation fails. The result is
+// released with tree_sitter_html_external_scanner_destroy().
+void *tree_sitter_html_external_scanner_create(void) {
+    Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner));
+    return scanner;
+}
+
+// Exported scan hook: forwards to scan() with the payload viewed as the
+// Scanner created by tree_sitter_html_external_scanner_create().
+bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
+                                            const bool *valid_symbols) {
+    return scan((Scanner *)payload, lexer, valid_symbols);
+}
+
+// Exported serialize hook: writes the scanner state into `buffer` and
+// returns the number of bytes written.
+unsigned tree_sitter_html_external_scanner_serialize(void *payload,
+                                                     char *buffer) {
+    return serialize((Scanner *)payload, buffer);
+}
+
+// Exported deserialize hook: restores the scanner state from the first
+// `length` bytes of `buffer`.
+void tree_sitter_html_external_scanner_deserialize(void *payload,
+                                                   const char *buffer,
+                                                   unsigned length) {
+    deserialize((Scanner *)payload, buffer, length);
+}
+
+// Exported destroy hook: releases every custom tag name still on the stack,
+// then the stack storage, then the scanner itself.
+void tree_sitter_html_external_scanner_destroy(void *payload) {
+    Scanner *scanner = (Scanner *)payload;
+    for (uint32_t i = 0; i < scanner->tags.len; i++) {
+        // Only CUSTOM tags own a name buffer. tag_free() checks the type, so
+        // the name pointer of any other tag is never passed to free().
+        tag_free(&scanner->tags.data[i]);
+    }
+    VEC_FREE(scanner->tags);
+    free(scanner);
+}
--- a/html/src/tag.h
+++ b/html/src/tag.h
@ -0,0 +1,384 @@
+#include "tree_sitter/parser.h"
+
+#include <assert.h>
+#include <string.h>
+
+// Every tag the scanner knows by name.
+//
+// The order is significant: is_void() treats every enumerator that precedes
+// END_OF_VOID_TAGS as a void element. CUSTOM is what get_tag_from_string()
+// returns for names missing from the lookup table, and END_ is the type
+// new_tag() gives to a freshly created tag.
+typedef enum {
+    AREA,
+    BASE,
+    BASEFONT,
+    BGSOUND,
+    BR,
+    COL,
+    COMMAND,
+    EMBED,
+    FRAME,
+    HR,
+    IMAGE,
+    IMG,
+    INPUT,
+    ISINDEX,
+    KEYGEN,
+    LINK,
+    MENUITEM,
+    META,
+    NEXTID,
+    PARAM,
+    SOURCE,
+    TRACK,
+    WBR,
+    END_OF_VOID_TAGS,  // marker: everything above is a void element
+
+    A,
+    ABBR,
+    ADDRESS,
+    ARTICLE,
+    ASIDE,
+    AUDIO,
+    B,
+    BDI,
+    BDO,
+    BLOCKQUOTE,
+    BODY,
+    BUTTON,
+    CANVAS,
+    CAPTION,
+    CITE,
+    CODE,
+    COLGROUP,
+    DATA,
+    DATALIST,
+    DD,
+    DEL,
+    DETAILS,
+    DFN,
+    DIALOG,
+    DIV,
+    DL,
+    DT,
+    EM,
+    FIELDSET,
+    FIGCAPTION,
+    FIGURE,
+    FOOTER,
+    FORM,
+    H1,
+    H2,
+    H3,
+    H4,
+    H5,
+    H6,
+    HEAD,
+    HEADER,
+    HGROUP,
+    HTML,
+    I,
+    IFRAME,
+    INS,
+    KBD,
+    LABEL,
+    LEGEND,
+    LI,
+    MAIN,
+    MAP,
+    MARK,
+    MATH,
+    MENU,
+    METER,
+    NAV,
+    NOSCRIPT,
+    OBJECT,
+    OL,
+    OPTGROUP,
+    OPTION,
+    OUTPUT,
+    P,
+    PICTURE,
+    PRE,
+    PROGRESS,
+    Q,
+    RB,
+    RP,
+    RT,
+    RTC,
+    RUBY,
+    S,
+    SAMP,
+    SCRIPT,
+    SECTION,
+    SELECT,
+    SLOT,
+    SMALL,
+    SPAN,
+    STRONG,
+    STYLE,
+    SUB,
+    SUMMARY,
+    SUP,
+    SVG,
+    TABLE,
+    TBODY,
+    TD,
+    TEMPLATE,
+    TEXTAREA,
+    TFOOT,
+    TH,
+    THEAD,
+    TIME,
+    TITLE,
+    TR,
+    U,
+    UL,
+    VAR,
+    VIDEO,
+
+    CUSTOM,  // any tag name not listed above
+
+    END_,  // type of a tag created by new_tag()
+} TagType;
+
+// Length-counted, heap-allocated character buffer.
+typedef struct {
+    uint32_t len;  // number of characters in use
+    uint32_t cap;  // capacity in characters
+    char *data;    // heap buffer, or NULL when nothing is allocated
+} String;
+
+// One row of the name lookup table: a tag name and the TagType it maps to.
+typedef struct {
+    char tag_name[16];  // NUL-terminated name, at most 15 characters
+    TagType tag_value;
+} TagMap;
+
+// A tag on the scanner's stack. custom_tag_name is only allocated (and only
+// freed by tag_free) when type is CUSTOM.
+typedef struct {
+    TagType type;
+    String custom_tag_name;
+} Tag;
+
+// Lookup table from upper-case tag name to TagType, searched linearly by
+// get_tag_from_string(). Declared `static` because this definition lives in
+// a header: with external linkage it would be exported from every object
+// file that includes tag.h and could collide with an identically named
+// table elsewhere in the same link.
+static const TagMap TAG_TYPES_BY_TAG_NAME[126] = {
+    {"AREA",       AREA      },
+    {"BASE",       BASE      },
+    {"BASEFONT",   BASEFONT  },
+    {"BGSOUND",    BGSOUND   },
+    {"BR",         BR        },
+    {"COL",        COL       },
+    {"COMMAND",    COMMAND   },
+    {"EMBED",      EMBED     },
+    {"FRAME",      FRAME     },
+    {"HR",         HR        },
+    {"IMAGE",      IMAGE     },
+    {"IMG",        IMG       },
+    {"INPUT",      INPUT     },
+    {"ISINDEX",    ISINDEX   },
+    {"KEYGEN",     KEYGEN    },
+    {"LINK",       LINK      },
+    {"MENUITEM",   MENUITEM  },
+    {"META",       META      },
+    {"NEXTID",     NEXTID    },
+    {"PARAM",      PARAM     },
+    {"SOURCE",     SOURCE    },
+    {"TRACK",      TRACK     },
+    {"WBR",        WBR       },
+    {"A",          A         },
+    {"ABBR",       ABBR      },
+    {"ADDRESS",    ADDRESS   },
+    {"ARTICLE",    ARTICLE   },
+    {"ASIDE",      ASIDE     },
+    {"AUDIO",      AUDIO     },
+    {"B",          B         },
+    {"BDI",        BDI       },
+    {"BDO",        BDO       },
+    {"BLOCKQUOTE", BLOCKQUOTE},
+    {"BODY",       BODY      },
+    {"BUTTON",     BUTTON    },
+    {"CANVAS",     CANVAS    },
+    {"CAPTION",    CAPTION   },
+    {"CITE",       CITE      },
+    {"CODE",       CODE      },
+    {"COLGROUP",   COLGROUP  },
+    {"DATA",       DATA      },
+    {"DATALIST",   DATALIST  },
+    {"DD",         DD        },
+    {"DEL",        DEL       },
+    {"DETAILS",    DETAILS   },
+    {"DFN",        DFN       },
+    {"DIALOG",     DIALOG    },
+    {"DIV",        DIV       },
+    {"DL",         DL        },
+    {"DT",         DT        },
+    {"EM",         EM        },
+    {"FIELDSET",   FIELDSET  },
+    {"FIGCAPTION", FIGCAPTION},
+    {"FIGURE",     FIGURE    },
+    {"FOOTER",     FOOTER    },
+    {"FORM",       FORM      },
+    {"H1",         H1        },
+    {"H2",         H2        },
+    {"H3",         H3        },
+    {"H4",         H4        },
+    {"H5",         H5        },
+    {"H6",         H6        },
+    {"HEAD",       HEAD      },
+    {"HEADER",     HEADER    },
+    {"HGROUP",     HGROUP    },
+    {"HTML",       HTML      },
+    {"I",          I         },
+    {"IFRAME",     IFRAME    },
+    {"INS",        INS       },
+    {"KBD",        KBD       },
+    {"LABEL",      LABEL     },
+    {"LEGEND",     LEGEND    },
+    {"LI",         LI        },
+    {"MAIN",       MAIN      },
+    {"MAP",        MAP       },
+    {"MARK",       MARK      },
+    {"MATH",       MATH      },
+    {"MENU",       MENU      },
+    {"METER",      METER     },
+    {"NAV",        NAV       },
+    {"NOSCRIPT",   NOSCRIPT  },
+    {"OBJECT",     OBJECT    },
+    {"OL",         OL        },
+    {"OPTGROUP",   OPTGROUP  },
+    {"OPTION",     OPTION    },
+    {"OUTPUT",     OUTPUT    },
+    {"P",          P         },
+    {"PICTURE",    PICTURE   },
+    {"PRE",        PRE       },
+    {"PROGRESS",   PROGRESS  },
+    {"Q",          Q         },
+    {"RB",         RB        },
+    {"RP",         RP        },
+    {"RT",         RT        },
+    {"RTC",        RTC       },
+    {"RUBY",       RUBY      },
+    {"S",          S         },
+    {"SAMP",       SAMP      },
+    {"SCRIPT",     SCRIPT    },
+    {"SECTION",    SECTION   },
+    {"SELECT",     SELECT    },
+    {"SLOT",       SLOT      },
+    {"SMALL",      SMALL     },
+    {"SPAN",       SPAN      },
+    {"STRONG",     STRONG    },
+    {"STYLE",      STYLE     },
+    {"SUB",        SUB       },
+    {"SUMMARY",    SUMMARY   },
+    {"SUP",        SUP       },
+    {"SVG",        SVG       },
+    {"TABLE",      TABLE     },
+    {"TBODY",      TBODY     },
+    {"TD",         TD        },
+    {"TEMPLATE",   TEMPLATE  },
+    {"TEXTAREA",   TEXTAREA  },
+    {"TFOOT",      TFOOT     },
+    {"TH",         TH        },
+    {"THEAD",      THEAD     },
+    {"TIME",       TIME      },
+    {"TITLE",      TITLE     },
+    {"TR",         TR        },
+    {"U",          U         },
+    {"UL",         UL        },
+    {"VAR",        VAR       },
+    {"VIDEO",      VIDEO     },
+    {"CUSTOM",     CUSTOM    },
+};
+
+// Tags whose start tag implicitly closes an open <p>: can_contain() returns
+// false for a P parent when the child is one of these.
+static const TagType TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[] = {
+    ADDRESS,  ARTICLE,    ASIDE,  BLOCKQUOTE, DETAILS, DIV, DL,
+    FIELDSET, FIGCAPTION, FIGURE, FOOTER,     FORM,    H1,  H2,
+    H3,       H4,         H5,     H6,         HEADER,  HR,  MAIN,
+    NAV,      OL,         P,      PRE,        SECTION,
+};
+
+// Map a tag name to its TagType with a linear search of
+// TAG_TYPES_BY_TAG_NAME. The comparison is case-sensitive and the table
+// holds upper-case names. Returns CUSTOM for names that are not in the
+// table.
+static TagType get_tag_from_string(const char *tag_name) {
+    // Derive the entry count from the table itself so the loop cannot drift
+    // out of sync if the table is ever resized.
+    const size_t entry_count =
+        sizeof(TAG_TYPES_BY_TAG_NAME) / sizeof(TAG_TYPES_BY_TAG_NAME[0]);
+    for (size_t i = 0; i < entry_count; i++) {
+        if (strcmp(TAG_TYPES_BY_TAG_NAME[i].tag_name, tag_name) == 0) {
+            return TAG_TYPES_BY_TAG_NAME[i].tag_value;
+        }
+    }
+    return CUSTOM;
+}
+
+// Create a placeholder tag: type END_ with no custom name allocated.
+static inline Tag new_tag() {
+    Tag tag = {
+        .type = END_,
+        .custom_tag_name = {.len = 0, .cap = 0, .data = NULL},
+    };
+    return tag;
+}
+
+// Build a tag of the given type. For CUSTOM tags a private, NUL-terminated
+// copy of `name` is stored in custom_tag_name (release it with tag_free);
+// for every other type `name` is ignored and no memory is allocated.
+static Tag make_tag(TagType type, const char *name) {
+    Tag tag = new_tag();
+    tag.type = type;
+    if (type == CUSTOM) {
+        size_t name_length = strlen(name);
+        // One extra zeroed byte keeps the copy NUL-terminated.
+        tag.custom_tag_name.data =
+            (char *)calloc(name_length + 1, sizeof(char));
+        // Same out-of-memory policy as the vector/string macros: abort
+        // instead of writing through a NULL pointer.
+        assert(tag.custom_tag_name.data != NULL);
+        memcpy(tag.custom_tag_name.data, name, name_length);
+        tag.custom_tag_name.len = (uint32_t)name_length;
+        // Record the capacity too, as deserialize() does for its tags.
+        tag.custom_tag_name.cap = (uint32_t)name_length;
+    }
+    return tag;
+}
+
+// Release the custom name owned by a CUSTOM tag. For any tag type the name
+// pointer is reset to NULL afterwards, so calling this twice is harmless.
+static inline void tag_free(Tag *tag) {
+    char **name = &tag->custom_tag_name.data;
+    if (tag->type == CUSTOM) {
+        free(*name);
+    }
+    *name = NULL;
+}
+
+// True for void elements, i.e. tag types listed before END_OF_VOID_TAGS in
+// the TagType enum.
+static inline bool is_void(const Tag *tag) {
+    const TagType type = tag->type;
+    return type < END_OF_VOID_TAGS;
+}
+
+// Build the tag for a tag name: look the name up and, if it is unknown,
+// produce a CUSTOM tag that stores a copy of the name.
+static inline Tag for_name(const char *name) {
+    TagType type = get_tag_from_string(name);
+    return make_tag(type, name);
+}
+
+// Tag equality: the types must match, and two CUSTOM tags must also have
+// identical names. Returns true when the tags are equal.
+static inline bool tagcmp(const Tag *_tag1, const Tag *_tag2) {
+    if (_tag1->type != _tag2->type) {
+        return false;
+    }
+    if (_tag1->type != CUSTOM) {
+        return true;
+    }
+    return strcmp(_tag1->custom_tag_name.data,
+                  _tag2->custom_tag_name.data) == 0;
+}
+
+// Whether an open element of type `self` may contain a child of type
+// `other`. A false result means the start tag of `other` implicitly closes
+// `self`. Only the tag types are considered; parent types without a case
+// below accept any child.
+static bool can_contain(Tag *self, const Tag *other) {
+    TagType child = other->type;
+
+    switch (self->type) {
+        case LI:
+            return child != LI;
+
+        case DT:
+        case DD:
+            return child != DT && child != DD;
+
+        case P: {
+            // Derive the count from the array so the loop bound cannot
+            // drift out of sync with the list.
+            const size_t count =
+                sizeof(TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS) /
+                sizeof(TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[0]);
+            for (size_t i = 0; i < count; i++) {
+                if (child == TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        case COLGROUP:
+            return child == COL;
+
+        case RB:
+        case RT:
+        case RP:
+            return child != RB && child != RT && child != RP;
+
+        case OPTGROUP:
+            return child != OPTGROUP;
+
+        case TR:
+            return child != TR;
+
+        case TD:
+        case TH:
+            return child != TD && child != TH && child != TR;
+
+        default:
+            return true;
+    }
+}
--- a/html/test/corpus/main.txt
+++ b/html/test/corpus/main.txt
@ -0,0 +1,365 @@
+===================================
+Tags
+===================================
+<span>Hello</span>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (text)
+    (end_tag (tag_name))))
+
+===================================
+Tags with attributes
+===================================
+<input value=yes class="a" data-💩></input>
+---
+
+(fragment
+  (element
+    (start_tag
+      (tag_name)
+      (attribute
+        (attribute_name)
+        (attribute_value))
+      (attribute
+        (attribute_name)
+        (quoted_attribute_value (attribute_value)))
+      (attribute
+        (attribute_name)))
+    (end_tag (tag_name))))
+
+===================================
+Nested tags
+===================================
+<div>
+  <span>a</span>
+  b
+  <b>c</b>
+  Multi-line
+  text
+</div>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element
+      (start_tag (tag_name))
+      (text)
+      (end_tag (tag_name)))
+    (text)
+    (element
+      (start_tag (tag_name))
+      (text)
+      (end_tag (tag_name)))
+    (text)
+    (end_tag (tag_name))))
+
+==================================
+Void tags
+==================================
+<form><img src="something.png"><br><input type=submit value=Ok /></form>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element
+      (start_tag
+        (tag_name)
+        (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
+    (element (start_tag (tag_name)))
+    (element
+      (self_closing_tag
+        (tag_name)
+        (attribute (attribute_name) (attribute_value))
+        (attribute (attribute_name) (attribute_value))))
+    (end_tag (tag_name))))
+
+==================================
+Void tags at EOF
+==================================
+<img src="something.png">
+---
+
+(fragment
+  (element
+    (start_tag
+      (tag_name)
+      (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
+
+==================================
+Custom tags
+==================================
+<something:different>
+  <atom-text-editor mini>
+    Hello
+  </atom-text-editor>
+</something:different>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element
+      (start_tag (tag_name) (attribute (attribute_name)))
+      (text)
+      (end_tag (tag_name)))
+    (end_tag (tag_name))))
+
+==================================
+Comments
+==================================
+<!-- hello -->
+<!-- world ->-> -- > ->->->-- -> still comment -->
+<div>
+  <!-- <span>something</span> -->
+</div>
+---
+
+(fragment
+  (comment)
+  (comment)
+  (element
+    (start_tag (tag_name))
+    (comment)
+    (end_tag (tag_name))))
+
+==================================
+Raw text elements
+==================================
+<script>
+  </s
+  </sc
+  </scr
+  </scri
+  </scrip
+</script>
+
+<style>
+  </ </s </st </sty </styl
+</style>
+
+<script>
+</SCRIPT>
+
+---
+
+(fragment
+  (script_element
+    (start_tag (tag_name))
+    (raw_text)
+    (end_tag (tag_name)))
+  (style_element
+    (start_tag (tag_name))
+    (raw_text)
+    (end_tag (tag_name)))
+  (script_element
+    (start_tag (tag_name))
+    (raw_text)
+    (end_tag (tag_name))))
+
+==================================
+All-caps doctype
+==================================
+<!DOCTYPE html PUBLIC
+  "-//W3C//DTD XHTML 1.0 Transitional//EN"
+  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+---
+
+(fragment
+  (doctype))
+
+==================================
+Lowercase doctype
+==================================
+<!doctype html>
+---
+
+(fragment
+  (doctype))
+
+==================================
+LI elements without close tags
+==================================
+<ul>
+  <li>One
+  <li>Two
+</ul>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))
+
+======================================
+DT and DD elements without close tags
+======================================
+<dl>
+  <dt>Coffee
+  <dt>Café
+  <dd>Black hot drink
+  <dt>Milk
+  <dd>White cold drink
+</dl>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))
+
+======================================
+P elements without close tags
+======================================
+<p>One
+<div>Two</div>
+<p>Three
+<p>Four
+<h1>Five</h1>
+---
+
+(fragment
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text))
+  (element (start_tag (tag_name)) (text) (end_tag (tag_name))))
+
+======================================
+Ruby annotation elements without close tags
+======================================
+<ruby>東<rb>京<rt>とう<rt>きょう</ruby>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (text)
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (element (start_tag (tag_name)) (text))
+    (end_tag (tag_name))))
+
+=======================================
+COLGROUP elements without end tags
+=======================================
+<table>
+  <colgroup>
+    <col style="background-color: #0f0">
+    <col span="2">
+  <tr>
+    <th>Lime</th>
+    <th>Lemon</th>
+    <th>Orange</th>
+  </tr>
+</table>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element
+      (start_tag (tag_name))
+      (element (start_tag
+        (tag_name)
+        (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
+      (element (start_tag
+        (tag_name)
+        (attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
+    (element
+      (start_tag (tag_name))
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (element (start_tag (tag_name)) (text) (end_tag (tag_name)))
+      (end_tag (tag_name)))
+    (end_tag (tag_name))))
+
+=========================================
+TR, TD, and TH elements without end tags
+=========================================
+<table>
+  <tr>
+    <th>One
+    <th>Two
+  <tr>
+    <td>Three
+    <td>Four
+</table>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (element
+      (start_tag (tag_name))
+      (element (start_tag (tag_name)) (text))
+      (element (start_tag (tag_name)) (text)))
+    (element
+      (start_tag (tag_name))
+      (element (start_tag (tag_name)) (text))
+      (element (start_tag (tag_name)) (text)))
+    (end_tag (tag_name))))
+
+==============================
+Named entities in tag contents
+==============================
+
+<p>Lorem ipsum &nbsp; dolor sit &copy; amet.</p>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (text)
+    (entity)
+    (text)
+    (entity)
+    (text)
+    (end_tag (tag_name))))
+
+================================
+Numeric entities in tag contents
+================================
+
+<p>Lorem ipsum &#160; dolor sit &#8212; amet.</p>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (text)
+    (entity)
+    (text)
+    (entity)
+    (text)
+    (end_tag (tag_name))))
+
+=================================
+Multiple entities in tag contents
+=================================
+
+<p>Lorem ipsum &#xA0; dolor &#xa0; sit &nbsp; amet.</p>
+---
+
+(fragment
+  (element
+    (start_tag (tag_name))
+    (text)
+    (entity)
+    (text)
+    (entity)
+    (text)
+    (entity)
+    (text)
+    (end_tag (tag_name))))
--- a/html/test/highlight/attributes.html
+++ b/html/test/highlight/attributes.html
@ -0,0 +1,39 @@
+<div style="display: flex" draggable>
+  <!-- <- tag -->
+  <!--^^^^ attribute -->
+  <!--      ^^^^^^^^^^^^^ string -->
+  <!--                     ^^^^^^^^^ attribute -->
+  <div onclick=tap>Hello, World</div>
+  <!-- <- punctuation.bracket -->
+  <div onclick=tap>Hello, World</div>
+  <!-- ^^^^^^^^ attribute -->
+  <div onclick=tap>Hello, World</div>
+  <!--         ^^^ string -->
+  <div onclick=tap>Hello, World</div>
+  <!--                         ^^ punctuation.bracket -->
+  <div onclick="tap">Hello, World</div>
+  <!--                             ^^^ tag -->
+  <div onclick="tap">Hello, World</div>
+  <!--                                ^ punctuation.bracket -->
+  <something:different
+  <!-- <- punctuation.bracket -->
+  <!-- ^^^^^^^^^^^^^^^ tag -->
+    @click="count++"
+<!--^^^^^^^ attribute -->
+<!--        ^^^^^^^ string -->
+    :value="count"
+<!--^^^^^^^ attribute -->
+<!--        ^^^^^ string -->
+    @value:modelValue="newValue => count = newValue"
+<!--^^^^^^^^^^^^^^^^^^ attribute -->
+<!--                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ string -->
+  >
+  <!-- <- punctuation.bracket -->
+  </something:different>
+  <!-- <- punctuation.bracket -->
+  <!-- ^^^^^^^^^^^^^^^^ tag -->
+  <!--                 ^ punctuation.bracket -->
+</div>
+<!-- <- punctuation.bracket -->
+<!--^ tag -->
+<!-- ^ punctuation.bracket -->
--- a/html/test/highlight/doctype.html
+++ b/html/test/highlight/doctype.html
@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!-- ^^^^^^^^^^^^^^^^^^^^ constant -->
+
+<!DOCTYPE html>
+<!-- ^^^^ constant -->
+<!--          ^ punctuation.bracket -->
--- a/html/test/highlight/erroneous.html
+++ b/html/test/highlight/erroneous.html
@ -0,0 +1,7 @@
+<span>
+<!-- <- punctuation.bracket -->
+<!--^ tag -->
+
+</div>
+<!--^ tag.error -->
+<!-- ^ punctuation.bracket -->
--- a/html/test/highlight/self-closing.html
+++ b/html/test/highlight/self-closing.html
@ -0,0 +1,15 @@
+<input />
+<!-- <- punctuation.bracket -->
+<!-- ^ tag -->
+
+<input type="submit" readonly />
+<!--   ^^^^ attribute -->
+
+<input type="submit" readonly />
+<!--         ^^^^^^ string -->
+
+<input type="submit" readonly />
+<!--                 ^^^^^^^^ attribute -->
+
+<input type="submit" readonly />
+<!--                          ^^ punctuation.bracket -->