From 3088bcfa224e6f259428054571529b09587af899 Mon Sep 17 00:00:00 2001 From: xenia Date: Mon, 11 Dec 2023 19:48:31 +0100 Subject: [PATCH] Add HTML --- README.md | 1 + flake.nix | 1 + html/LICENSE | 21 ++ html/grammar.js | 143 ++++++++ html/src/scanner.c | 452 ++++++++++++++++++++++++++ html/src/tag.h | 384 ++++++++++++++++++++++ html/test/corpus/main.txt | 365 +++++++++++++++++++++ html/test/highlight/attributes.html | 39 +++ html/test/highlight/doctype.html | 6 + html/test/highlight/erroneous.html | 7 + html/test/highlight/self-closing.html | 15 + 11 files changed, 1434 insertions(+) create mode 100644 html/LICENSE create mode 100644 html/grammar.js create mode 100644 html/src/scanner.c create mode 100644 html/src/tag.h create mode 100644 html/test/corpus/main.txt create mode 100644 html/test/highlight/attributes.html create mode 100644 html/test/highlight/doctype.html create mode 100644 html/test/highlight/erroneous.html create mode 100644 html/test/highlight/self-closing.html diff --git a/README.md b/README.md index c4e2f4a..ec2e29f 100644 --- a/README.md +++ b/README.md @@ -7,3 +7,4 @@ Languages: * nix: https://github.com/nix-community/tree-sitter-nix (MIT) * python: https://github.com/tree-sitter/tree-sitter-python (MIT) * rust: https://github.com/tree-sitter/tree-sitter-rust (MIT) +* html: https://github.com/tree-sitter/tree-sitter-html (MIT) diff --git a/flake.nix b/flake.nix index ac7b322..f98185c 100644 --- a/flake.nix +++ b/flake.nix @@ -44,6 +44,7 @@ packages.nix = compile-tree-sitter { src = ./nix; name = "nix"; }; packages.python = compile-tree-sitter { src = ./python; name = "python"; }; packages.rust = compile-tree-sitter { src = ./rust; name = "rust"; }; + packages.html = compile-tree-sitter { src = ./html; name = "html"; }; } ); } diff --git a/html/LICENSE b/html/LICENSE new file mode 100644 index 0000000..4b52d19 --- /dev/null +++ b/html/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Max Brunsfeld + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/html/grammar.js b/html/grammar.js new file mode 100644 index 0000000..3f76f1e --- /dev/null +++ b/html/grammar.js @@ -0,0 +1,143 @@ +/** + * @file HTML grammar for tree-sitter + * @author Max Brunsfeld + * @license MIT + */ + +/* eslint-disable arrow-parens */ +/* eslint-disable camelcase */ +/* eslint-disable-next-line spaced-comment */ +/// +// @ts-check + +module.exports = grammar({ + name: 'html', + + extras: $ => [ + $.comment, + /\s+/, + ], + + externals: $ => [ + $._start_tag_name, + $._script_start_tag_name, + $._style_start_tag_name, + $._end_tag_name, + $.erroneous_end_tag_name, + '/>', + $._implicit_end_tag, + $.raw_text, + $.comment, + ], + + rules: { + fragment: $ => repeat($._node), + + doctype: $ => seq( + ']+/, + '>', + ), + + _doctype: _ => /[Dd][Oo][Cc][Tt][Yy][Pp][Ee]/, + + _node: $ => choice( + $.doctype, + $.entity, + $.text, + $.element, + $.script_element, + $.style_element, + $.erroneous_end_tag, + ), + + element: $ => choice( + seq( + $.start_tag, + repeat($._node), + choice($.end_tag, $._implicit_end_tag), + ), + $.self_closing_tag, + ), + + script_element: $ => seq( + alias($.script_start_tag, $.start_tag), + optional($.raw_text), + $.end_tag, + ), + + style_element: $ => seq( + alias($.style_start_tag, $.start_tag), + optional($.raw_text), + $.end_tag, + ), + + start_tag: $ => seq( + '<', + alias($._start_tag_name, $.tag_name), + repeat($.attribute), + '>', + ), + + script_start_tag: $ => seq( + '<', + alias($._script_start_tag_name, $.tag_name), + repeat($.attribute), + '>', + ), + + style_start_tag: $ => seq( + '<', + alias($._style_start_tag_name, $.tag_name), + repeat($.attribute), + '>', + ), + + self_closing_tag: $ => seq( + '<', + alias($._start_tag_name, $.tag_name), + repeat($.attribute), + '/>', + ), + + end_tag: $ => seq( + '', + ), + + erroneous_end_tag: $ => seq( + '', + ), + + attribute: $ => seq( + $.attribute_name, + optional(seq( + '=', + choice( + $.attribute_value, + $.quoted_attribute_value, + ), + )), + ), + + attribute_name: _ => /[^<>"'/=\s]+/, + + attribute_value: _ => /[^<>"'=\s]+/, + + // An entity can be named, numeric (decimal), or numeric (hexacecimal). The + // longest entity name is 29 characters long, and the HTML spec says that + // no more will ever be added. + entity: _ => /&(#([xX][0-9a-fA-F]{1,6}|[0-9]{1,5})|[A-Za-z]{1,30});/, + + quoted_attribute_value: $ => choice( + seq('\'', optional(alias(/[^']+/, $.attribute_value)), '\''), + seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"'), + ), + + text: _ => /[^<>&\s]([^<>&]*[^<>&\s])?/, + }, +}); diff --git a/html/src/scanner.c b/html/src/scanner.c new file mode 100644 index 0000000..9ec2915 --- /dev/null +++ b/html/src/scanner.c @@ -0,0 +1,452 @@ +#include "tag.h" + +#include + +enum TokenType { + START_TAG_NAME, + SCRIPT_START_TAG_NAME, + STYLE_START_TAG_NAME, + END_TAG_NAME, + ERRONEOUS_END_TAG_NAME, + SELF_CLOSING_TAG_DELIMITER, + IMPLICIT_END_TAG, + RAW_TEXT, + COMMENT +}; + +typedef struct { + uint32_t len; + uint32_t cap; + Tag *data; +} tags_vec; + +typedef struct { + tags_vec tags; +} Scanner; + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + if ((_cap) > (vec).cap && (_cap) > 0) { \ + void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + (vec).cap = (_cap); \ + } + +#define VEC_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + VEC_RESIZE((vec), (_cap)); \ + } + +#define VEC_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define VEC_POP(vec) \ + { \ + if (VEC_BACK(vec).type == CUSTOM) { \ + tag_free(&VEC_BACK(vec)); \ + } \ + (vec).len--; \ + } + +#define VEC_BACK(vec) ((vec).data[(vec).len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + (vec).data = NULL; \ + } + +#define VEC_CLEAR(vec) \ + { \ + for (int i = 0; i < (vec).len; i++) { \ + tag_free(&(vec).data[i]); \ + } \ + (vec).len = 0; \ + } + +#define STRING_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + memset((vec).data + (vec).len, 0, \ + (((_cap) + 1) - (vec).len) * sizeof((vec).data[0])); \ + (vec).cap = (_cap); + +#define STRING_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + STRING_RESIZE((vec), (_cap)); \ + } + +#define STRING_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define STRING_INIT(vec) \ + { \ + (vec).data = calloc(1, sizeof(char) * 17); \ + (vec).len = 0; \ + (vec).cap = 16; \ + } + +#define STRING_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + (vec).data = NULL; \ + } + +#define STRING_CLEAR(vec) \ + { \ + (vec).len = 0; \ + memset((vec).data, 0, (vec).cap * sizeof(char)); \ + } + +static unsigned serialize(Scanner *scanner, char *buffer) { + uint16_t tag_count = + scanner->tags.len > UINT16_MAX ? UINT16_MAX : scanner->tags.len; + uint16_t serialized_tag_count = 0; + + unsigned size = sizeof(tag_count); + memcpy(&buffer[size], &tag_count, sizeof(tag_count)); + size += sizeof(tag_count); + + for (; serialized_tag_count < tag_count; serialized_tag_count++) { + Tag tag = scanner->tags.data[serialized_tag_count]; + if (tag.type == CUSTOM) { + unsigned name_length = tag.custom_tag_name.len; + if (name_length > UINT8_MAX) { + name_length = UINT8_MAX; + } + if (size + 2 + name_length >= + TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + break; + } + buffer[size++] = (char)tag.type; + buffer[size++] = (char)name_length; + strncpy(&buffer[size], tag.custom_tag_name.data, name_length); + size += name_length; + } else { + if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + break; + } + buffer[size++] = (char)tag.type; + } + } + + memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count)); + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + VEC_CLEAR(scanner->tags); + if (length > 0) { + unsigned size = 0; + uint16_t tag_count = 0; + uint16_t serialized_tag_count = 0; + + memcpy(&serialized_tag_count, &buffer[size], + sizeof(serialized_tag_count)); + size += sizeof(serialized_tag_count); + + memcpy(&tag_count, &buffer[size], sizeof(tag_count)); + size += sizeof(tag_count); + + VEC_RESIZE(scanner->tags, tag_count); + if (tag_count > 0) { + unsigned iter = 0; + for (iter = 0; iter < serialized_tag_count; iter++) { + Tag tag = scanner->tags.data[iter]; + tag.type = (TagType)buffer[size++]; + if (tag.type == CUSTOM) { + uint16_t name_length = (uint8_t)buffer[size++]; + tag.custom_tag_name.len = name_length; + tag.custom_tag_name.cap = name_length; + tag.custom_tag_name.data = + (char *)calloc(1, sizeof(char) * (name_length + 1)); + strncpy(tag.custom_tag_name.data, &buffer[size], + name_length); + size += name_length; + } + VEC_PUSH(scanner->tags, tag); + } + // add zero tags if we didn't read enough, this is because the + // buffer had no more room but we held more tags. + for (; iter < tag_count; iter++) { + Tag tag = new_tag(); + VEC_PUSH(scanner->tags, tag); + } + } + } +} + +static String scan_tag_name(TSLexer *lexer) { + String tag_name; + STRING_INIT(tag_name); + while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || + lexer->lookahead == ':') { + STRING_PUSH(tag_name, towupper(lexer->lookahead)); + lexer->advance(lexer, false); + } + return tag_name; +} + +static bool scan_comment(TSLexer *lexer) { + if (lexer->lookahead != '-') { + return false; + } + lexer->advance(lexer, false); + if (lexer->lookahead != '-') { + return false; + } + lexer->advance(lexer, false); + + unsigned dashes = 0; + while (lexer->lookahead) { + switch (lexer->lookahead) { + case '-': + ++dashes; + break; + case '>': + if (dashes >= 2) { + lexer->result_symbol = COMMENT; + lexer->advance(lexer, false); + lexer->mark_end(lexer); + return true; + } + default: + dashes = 0; + } + lexer->advance(lexer, false); + } + return false; +} + +static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) { + if (scanner->tags.len == 0) { + return false; + } + + lexer->mark_end(lexer); + + const char *end_delimiter = + VEC_BACK(scanner->tags).type == SCRIPT ? "lookahead) { + if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) { + delimiter_index++; + if (delimiter_index == strlen(end_delimiter)) { + break; + } + lexer->advance(lexer, false); + } else { + delimiter_index = 0; + lexer->advance(lexer, false); + lexer->mark_end(lexer); + } + } + + lexer->result_symbol = RAW_TEXT; + return true; +} + +static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) { + Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags); + + bool is_closing_tag = false; + if (lexer->lookahead == '/') { + is_closing_tag = true; + lexer->advance(lexer, false); + } else { + if (parent && is_void(parent)) { + VEC_POP(scanner->tags); + lexer->result_symbol = IMPLICIT_END_TAG; + return true; + } + } + + String tag_name = scan_tag_name(lexer); + if (tag_name.len == 0) { + STRING_FREE(tag_name); + return false; + } + + Tag next_tag = for_name(tag_name.data); + + if (is_closing_tag) { + // The tag correctly closes the topmost element on the stack + if (scanner->tags.len > 0 && + tagcmp(&VEC_BACK(scanner->tags), &next_tag)) { + STRING_FREE(tag_name); + tag_free(&next_tag); + return false; + } + + // Otherwise, dig deeper and queue implicit end tags (to be nice in + // the case of malformed HTML) + for (unsigned i = scanner->tags.len; i > 0; i--) { + if (scanner->tags.data[i - 1].type == next_tag.type) { + VEC_POP(scanner->tags); + lexer->result_symbol = IMPLICIT_END_TAG; + STRING_FREE(tag_name); + tag_free(&next_tag); + return true; + } + } + } else if (parent && !can_contain(parent, &next_tag)) { + VEC_POP(scanner->tags); + lexer->result_symbol = IMPLICIT_END_TAG; + STRING_FREE(tag_name); + tag_free(&next_tag); + return true; + } + + STRING_FREE(tag_name); + tag_free(&next_tag); + return false; +} + +static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) { + String tag_name = scan_tag_name(lexer); + if (tag_name.len == 0) { + STRING_FREE(tag_name); + return false; + } + Tag tag = for_name(tag_name.data); + VEC_PUSH(scanner->tags, tag); + switch (tag.type) { + case SCRIPT: + lexer->result_symbol = SCRIPT_START_TAG_NAME; + break; + case STYLE: + lexer->result_symbol = STYLE_START_TAG_NAME; + break; + default: + lexer->result_symbol = START_TAG_NAME; + break; + } + STRING_FREE(tag_name); + return true; +} + +static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) { + String tag_name = scan_tag_name(lexer); + if (tag_name.len == 0) { + STRING_FREE(tag_name); + return false; + } + Tag tag = for_name(tag_name.data); + if (scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &tag)) { + VEC_POP(scanner->tags); + lexer->result_symbol = END_TAG_NAME; + } else { + lexer->result_symbol = ERRONEOUS_END_TAG_NAME; + } + tag_free(&tag); + STRING_FREE(tag_name); + return true; +} + +static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) { + lexer->advance(lexer, false); + if (lexer->lookahead == '>') { + lexer->advance(lexer, false); + if (scanner->tags.len > 0) { + VEC_POP(scanner->tags); + lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER; + } + return true; + } + return false; +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && + !valid_symbols[END_TAG_NAME]) { + return scan_raw_text(scanner, lexer); + } + + while (iswspace(lexer->lookahead)) { + lexer->advance(lexer, true); + } + + switch (lexer->lookahead) { + case '<': + lexer->mark_end(lexer); + lexer->advance(lexer, false); + + if (lexer->lookahead == '!') { + lexer->advance(lexer, false); + return scan_comment(lexer); + } + + if (valid_symbols[IMPLICIT_END_TAG]) { + return scan_implicit_end_tag(scanner, lexer); + } + break; + + case '\0': + if (valid_symbols[IMPLICIT_END_TAG]) { + return scan_implicit_end_tag(scanner, lexer); + } + break; + + case '/': + if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) { + return scan_self_closing_tag_delimiter(scanner, lexer); + } + break; + + default: + if ((valid_symbols[START_TAG_NAME] || + valid_symbols[END_TAG_NAME]) && + !valid_symbols[RAW_TEXT]) { + return valid_symbols[START_TAG_NAME] + ? scan_start_tag_name(scanner, lexer) + : scan_end_tag_name(scanner, lexer); + } + } + + return false; +} + +void *tree_sitter_html_external_scanner_create() { + Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); + return scanner; +} + +bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +unsigned tree_sitter_html_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_html_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +void tree_sitter_html_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + for (unsigned i = 0; i < scanner->tags.len; i++) { + STRING_FREE(scanner->tags.data[i].custom_tag_name); + } + VEC_FREE(scanner->tags); + free(scanner); +} diff --git a/html/src/tag.h b/html/src/tag.h new file mode 100644 index 0000000..f20591a --- /dev/null +++ b/html/src/tag.h @@ -0,0 +1,384 @@ +#include "tree_sitter/parser.h" + +#include +#include + +typedef enum { + AREA, + BASE, + BASEFONT, + BGSOUND, + BR, + COL, + COMMAND, + EMBED, + FRAME, + HR, + IMAGE, + IMG, + INPUT, + ISINDEX, + KEYGEN, + LINK, + MENUITEM, + META, + NEXTID, + PARAM, + SOURCE, + TRACK, + WBR, + END_OF_VOID_TAGS, + + A, + ABBR, + ADDRESS, + ARTICLE, + ASIDE, + AUDIO, + B, + BDI, + BDO, + BLOCKQUOTE, + BODY, + BUTTON, + CANVAS, + CAPTION, + CITE, + CODE, + COLGROUP, + DATA, + DATALIST, + DD, + DEL, + DETAILS, + DFN, + DIALOG, + DIV, + DL, + DT, + EM, + FIELDSET, + FIGCAPTION, + FIGURE, + FOOTER, + FORM, + H1, + H2, + H3, + H4, + H5, + H6, + HEAD, + HEADER, + HGROUP, + HTML, + I, + IFRAME, + INS, + KBD, + LABEL, + LEGEND, + LI, + MAIN, + MAP, + MARK, + MATH, + MENU, + METER, + NAV, + NOSCRIPT, + OBJECT, + OL, + OPTGROUP, + OPTION, + OUTPUT, + P, + PICTURE, + PRE, + PROGRESS, + Q, + RB, + RP, + RT, + RTC, + RUBY, + S, + SAMP, + SCRIPT, + SECTION, + SELECT, + SLOT, + SMALL, + SPAN, + STRONG, + STYLE, + SUB, + SUMMARY, + SUP, + SVG, + TABLE, + TBODY, + TD, + TEMPLATE, + TEXTAREA, + TFOOT, + TH, + THEAD, + TIME, + TITLE, + TR, + U, + UL, + VAR, + VIDEO, + + CUSTOM, + + END_, +} TagType; + +typedef struct { + uint32_t len; + uint32_t cap; + char *data; +} String; + +typedef struct { + char tag_name[16]; + TagType tag_value; +} TagMap; + +typedef struct { + TagType type; + String custom_tag_name; +} Tag; + +const TagMap TAG_TYPES_BY_TAG_NAME[126] = { + {"AREA", AREA }, + {"BASE", BASE }, + {"BASEFONT", BASEFONT }, + {"BGSOUND", BGSOUND }, + {"BR", BR }, + {"COL", COL }, + {"COMMAND", COMMAND }, + {"EMBED", EMBED }, + {"FRAME", FRAME }, + {"HR", HR }, + {"IMAGE", IMAGE }, + {"IMG", IMG }, + {"INPUT", INPUT }, + {"ISINDEX", ISINDEX }, + {"KEYGEN", KEYGEN }, + {"LINK", LINK }, + {"MENUITEM", MENUITEM }, + {"META", META }, + {"NEXTID", NEXTID }, + {"PARAM", PARAM }, + {"SOURCE", SOURCE }, + {"TRACK", TRACK }, + {"WBR", WBR }, + {"A", A }, + {"ABBR", ABBR }, + {"ADDRESS", ADDRESS }, + {"ARTICLE", ARTICLE }, + {"ASIDE", ASIDE }, + {"AUDIO", AUDIO }, + {"B", B }, + {"BDI", BDI }, + {"BDO", BDO }, + {"BLOCKQUOTE", BLOCKQUOTE}, + {"BODY", BODY }, + {"BUTTON", BUTTON }, + {"CANVAS", CANVAS }, + {"CAPTION", CAPTION }, + {"CITE", CITE }, + {"CODE", CODE }, + {"COLGROUP", COLGROUP }, + {"DATA", DATA }, + {"DATALIST", DATALIST }, + {"DD", DD }, + {"DEL", DEL }, + {"DETAILS", DETAILS }, + {"DFN", DFN }, + {"DIALOG", DIALOG }, + {"DIV", DIV }, + {"DL", DL }, + {"DT", DT }, + {"EM", EM }, + {"FIELDSET", FIELDSET }, + {"FIGCAPTION", FIGCAPTION}, + {"FIGURE", FIGURE }, + {"FOOTER", FOOTER }, + {"FORM", FORM }, + {"H1", H1 }, + {"H2", H2 }, + {"H3", H3 }, + {"H4", H4 }, + {"H5", H5 }, + {"H6", H6 }, + {"HEAD", HEAD }, + {"HEADER", HEADER }, + {"HGROUP", HGROUP }, + {"HTML", HTML }, + {"I", I }, + {"IFRAME", IFRAME }, + {"INS", INS }, + {"KBD", KBD }, + {"LABEL", LABEL }, + {"LEGEND", LEGEND }, + {"LI", LI }, + {"MAIN", MAIN }, + {"MAP", MAP }, + {"MARK", MARK }, + {"MATH", MATH }, + {"MENU", MENU }, + {"METER", METER }, + {"NAV", NAV }, + {"NOSCRIPT", NOSCRIPT }, + {"OBJECT", OBJECT }, + {"OL", OL }, + {"OPTGROUP", OPTGROUP }, + {"OPTION", OPTION }, + {"OUTPUT", OUTPUT }, + {"P", P }, + {"PICTURE", PICTURE }, + {"PRE", PRE }, + {"PROGRESS", PROGRESS }, + {"Q", Q }, + {"RB", RB }, + {"RP", RP }, + {"RT", RT }, + {"RTC", RTC }, + {"RUBY", RUBY }, + {"S", S }, + {"SAMP", SAMP }, + {"SCRIPT", SCRIPT }, + {"SECTION", SECTION }, + {"SELECT", SELECT }, + {"SLOT", SLOT }, + {"SMALL", SMALL }, + {"SPAN", SPAN }, + {"STRONG", STRONG }, + {"STYLE", STYLE }, + {"SUB", SUB }, + {"SUMMARY", SUMMARY }, + {"SUP", SUP }, + {"SVG", SVG }, + {"TABLE", TABLE }, + {"TBODY", TBODY }, + {"TD", TD }, + {"TEMPLATE", TEMPLATE }, + {"TEXTAREA", TEXTAREA }, + {"TFOOT", TFOOT }, + {"TH", TH }, + {"THEAD", THEAD }, + {"TIME", TIME }, + {"TITLE", TITLE }, + {"TR", TR }, + {"U", U }, + {"UL", UL }, + {"VAR", VAR }, + {"VIDEO", VIDEO }, + {"CUSTOM", CUSTOM }, +}; + +static const TagType TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[] = { + ADDRESS, ARTICLE, ASIDE, BLOCKQUOTE, DETAILS, DIV, DL, + FIELDSET, FIGCAPTION, FIGURE, FOOTER, FORM, H1, H2, + H3, H4, H5, H6, HEADER, HR, MAIN, + NAV, OL, P, PRE, SECTION, +}; + +static TagType get_tag_from_string(const char *tag_name) { + for (int i = 0; i < 126; i++) { + if (strcmp(TAG_TYPES_BY_TAG_NAME[i].tag_name, tag_name) == 0) { + return TAG_TYPES_BY_TAG_NAME[i].tag_value; + } + } + return CUSTOM; +} + +static inline Tag new_tag() { + Tag tag; + tag.type = END_; + tag.custom_tag_name.data = NULL; + tag.custom_tag_name.len = 0; + tag.custom_tag_name.cap = 0; + return tag; +} + +static Tag make_tag(TagType type, const char *name) { + Tag tag = new_tag(); + tag.type = type; + if (type == CUSTOM) { + tag.custom_tag_name.len = strlen(name); + tag.custom_tag_name.data = + (char *)calloc(1, sizeof(char) * (tag.custom_tag_name.len + 1)); + strncpy(tag.custom_tag_name.data, name, tag.custom_tag_name.len); + } + return tag; +} + +static inline void tag_free(Tag *tag) { + if (tag->type == CUSTOM) { + free(tag->custom_tag_name.data); + } + tag->custom_tag_name.data = NULL; +} + +static inline bool is_void(const Tag *tag) { + return tag->type < END_OF_VOID_TAGS; +} + +static inline Tag for_name(const char *name) { + return make_tag(get_tag_from_string(name), name); +} + +static inline bool tagcmp(const Tag *_tag1, const Tag *_tag2) { + return _tag1->type == _tag2->type && + (_tag1->type == CUSTOM ? strcmp(_tag1->custom_tag_name.data, + _tag2->custom_tag_name.data) == 0 + : true); +} + +static bool can_contain(Tag *self, const Tag *other) { + TagType child = other->type; + + switch (self->type) { + case LI: + return child != LI; + + case DT: + case DD: + return child != DT && child != DD; + + case P: + for (int i = 0; i < 26; i++) { + if (child == TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[i]) { + return false; + } + } + return true; + + case COLGROUP: + return child == COL; + + case RB: + case RT: + case RP: + return child != RB && child != RT && child != RP; + + case OPTGROUP: + return child != OPTGROUP; + + case TR: + return child != TR; + + case TD: + case TH: + return child != TD && child != TH && child != TR; + + default: + return true; + } +} diff --git a/html/test/corpus/main.txt b/html/test/corpus/main.txt new file mode 100644 index 0000000..44f66fe --- /dev/null +++ b/html/test/corpus/main.txt @@ -0,0 +1,365 @@ +=================================== +Tags +=================================== +Hello +--- + +(fragment + (element + (start_tag (tag_name)) + (text) + (end_tag (tag_name)))) + +=================================== +Tags with attributes +=================================== + +--- + +(fragment + (element + (start_tag + (tag_name) + (attribute + (attribute_name) + (attribute_value)) + (attribute + (attribute_name) + (quoted_attribute_value (attribute_value))) + (attribute + (attribute_name))) + (end_tag (tag_name)))) + +=================================== +Nested tags +=================================== +
+ a + b + c + Multi-line + text +
+--- + +(fragment + (element + (start_tag (tag_name)) + (element + (start_tag (tag_name)) + (text) + (end_tag (tag_name))) + (text) + (element + (start_tag (tag_name)) + (text) + (end_tag (tag_name))) + (text) + (end_tag (tag_name)))) + +================================== +Void tags +================================== +

+--- + +(fragment + (element + (start_tag (tag_name)) + (element + (start_tag + (tag_name) + (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) + (element (start_tag (tag_name))) + (element + (self_closing_tag + (tag_name) + (attribute (attribute_name) (attribute_value)) + (attribute (attribute_name) (attribute_value)))) + (end_tag (tag_name)))) + +================================== +Void tags at EOF +================================== + +--- + +(fragment + (element + (start_tag + (tag_name) + (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))) + +================================== +Custom tags +================================== + + + Hello + + +--- + +(fragment + (element + (start_tag (tag_name)) + (element + (start_tag (tag_name) (attribute (attribute_name))) + (text) + (end_tag (tag_name))) + (end_tag (tag_name)))) + +================================== +Comments +================================== + + +
+ +
+--- + +(fragment + (comment) + (comment) + (element + (start_tag (tag_name)) + (comment) + (end_tag (tag_name)))) + +================================== +Raw text elements +================================== + + + + + + +--- + +(fragment + (script_element + (start_tag (tag_name)) + (raw_text) + (end_tag (tag_name))) + (style_element + (start_tag (tag_name)) + (raw_text) + (end_tag (tag_name))) + (script_element + (start_tag (tag_name)) + (raw_text) + (end_tag (tag_name)))) + +================================== +All-caps doctype +================================== + +--- + +(fragment + (doctype)) + +================================== +Lowercase doctype +================================== + +--- + +(fragment + (doctype)) + +================================== +LI elements without close tags +================================== +
    +
  • One +
  • Two +
+--- + +(fragment + (element + (start_tag (tag_name)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (end_tag (tag_name)))) + +====================================== +DT and DL elements without close tags +====================================== +
+
Coffee +
Café +
Black hot drink +
Milk +
White cold drink +
+--- + +(fragment + (element + (start_tag (tag_name)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (end_tag (tag_name)))) + +====================================== +P elements without close tags +====================================== +

One +

Two
+

Three +

Four +

Five

+--- + +(fragment + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text) (end_tag (tag_name))) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text) (end_tag (tag_name)))) + +====================================== +Ruby annotation elements without close tags +====================================== +とうきょう +--- + +(fragment + (element + (start_tag (tag_name)) + (text) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text)) + (end_tag (tag_name)))) + +======================================= +COLGROUP elements without end tags +======================================= + + + + + + + + + +
LimeLemonOrange
+--- + +(fragment + (element + (start_tag (tag_name)) + (element + (start_tag (tag_name)) + (element (start_tag + (tag_name) + (attribute (attribute_name) (quoted_attribute_value (attribute_value))))) + (element (start_tag + (tag_name) + (attribute (attribute_name) (quoted_attribute_value (attribute_value)))))) + (element + (start_tag (tag_name)) + (element (start_tag (tag_name)) (text) (end_tag (tag_name))) + (element (start_tag (tag_name)) (text) (end_tag (tag_name))) + (element (start_tag (tag_name)) (text) (end_tag (tag_name))) + (end_tag (tag_name))) + (end_tag (tag_name)))) + +========================================= +TR, TD, and TH elements without end tags +========================================= + + + +
One + Two +
Three + Four +
+--- + +(fragment + (element + (start_tag (tag_name)) + (element + (start_tag (tag_name)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text))) + (element + (start_tag (tag_name)) + (element (start_tag (tag_name)) (text)) + (element (start_tag (tag_name)) (text))) + (end_tag (tag_name)))) + +============================== +Named entities in tag contents +============================== + +

Lorem ipsum   dolor sit © amet.

+--- + +(fragment + (element + (start_tag (tag_name)) + (text) + (entity) + (text) + (entity) + (text) + (end_tag (tag_name)))) + +================================ +Numeric entities in tag contents +================================ + +

Lorem ipsum   dolor sit — amet.

+--- + +(fragment + (element + (start_tag (tag_name)) + (text) + (entity) + (text) + (entity) + (text) + (end_tag (tag_name)))) + +================================= +Multiple entities in tag contents +================================= + +

Lorem ipsum   dolor   sit   amet.

+--- + +(fragment + (element + (start_tag (tag_name)) + (text) + (entity) + (text) + (entity) + (text) + (entity) + (text) + (end_tag (tag_name)))) diff --git a/html/test/highlight/attributes.html b/html/test/highlight/attributes.html new file mode 100644 index 0000000..4bb627d --- /dev/null +++ b/html/test/highlight/attributes.html @@ -0,0 +1,39 @@ +
+ + + + +
Hello, World
+ +
Hello, World
+ +
Hello, World
+ +
Hello, World
+ +
Hello, World
+ +
Hello, World
+ + + + @click="count++" + + + :value="count" + + + @value:modelValue="newValue => count = newValue" + + + > + + + + + +
+ + + diff --git a/html/test/highlight/doctype.html b/html/test/highlight/doctype.html new file mode 100644 index 0000000..ec1023a --- /dev/null +++ b/html/test/highlight/doctype.html @@ -0,0 +1,6 @@ + + + + + + diff --git a/html/test/highlight/erroneous.html b/html/test/highlight/erroneous.html new file mode 100644 index 0000000..5e13f70 --- /dev/null +++ b/html/test/highlight/erroneous.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/html/test/highlight/self-closing.html b/html/test/highlight/self-closing.html new file mode 100644 index 0000000..cc38caa --- /dev/null +++ b/html/test/highlight/self-closing.html @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + +