This commit is contained in:
xenia 2023-12-11 19:48:31 +01:00
parent bb561d2e33
commit 3088bcfa22
11 changed files with 1434 additions and 0 deletions

View File

@ -7,3 +7,4 @@ Languages:
* nix: https://github.com/nix-community/tree-sitter-nix (MIT)
* python: https://github.com/tree-sitter/tree-sitter-python (MIT)
* rust: https://github.com/tree-sitter/tree-sitter-rust (MIT)
* html: https://github.com/tree-sitter/tree-sitter-html (MIT)

View File

@ -44,6 +44,7 @@
packages.nix = compile-tree-sitter { src = ./nix; name = "nix"; };
packages.python = compile-tree-sitter { src = ./python; name = "python"; };
packages.rust = compile-tree-sitter { src = ./rust; name = "rust"; };
packages.html = compile-tree-sitter { src = ./html; name = "html"; };
}
);
}

21
html/LICENSE Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2014 Max Brunsfeld
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

143
html/grammar.js Normal file
View File

@ -0,0 +1,143 @@
/**
* @file HTML grammar for tree-sitter
* @author Max Brunsfeld
* @license MIT
*/
/* eslint-disable arrow-parens */
/* eslint-disable camelcase */
/* eslint-disable-next-line spaced-comment */
/// <reference types="tree-sitter-cli/dsl" />
// @ts-check
// Tree-sitter grammar for HTML. Tag names, raw text, comments and implicit end
// tags are not matched by rules here; they are declared in `externals` and are
// produced by the external scanner.
module.exports = grammar({
name: 'html',
// Tokens that may appear between any other tokens: comments and whitespace.
extras: $ => [
$.comment,
/\s+/,
],
// External tokens. This list is in the same order as the scanner's
// `TokenType` enum; keep the two in sync when adding or removing entries.
externals: $ => [
$._start_tag_name,
$._script_start_tag_name,
$._style_start_tag_name,
$._end_tag_name,
$.erroneous_end_tag_name,
'/>',
$._implicit_end_tag,
$.raw_text,
$.comment,
],
rules: {
// Root rule: a document is any sequence of nodes (possibly empty).
fragment: $ => repeat($._node),
// `<!doctype ...>`; the keyword is matched case-insensitively via `_doctype`
// and everything up to the closing `>` is accepted.
doctype: $ => seq(
'<!',
alias($._doctype, 'doctype'),
/[^>]+/,
'>',
),
_doctype: _ => /[Dd][Oo][Cc][Tt][Yy][Pp][Ee]/,
// Anything that can appear at the top level or as the child of an element.
_node: $ => choice(
$.doctype,
$.entity,
$.text,
$.element,
$.script_element,
$.style_element,
$.erroneous_end_tag,
),
// Either a start tag, its children and an explicit or implicit end tag, or
// a single self-closing tag.
element: $ => choice(
seq(
$.start_tag,
repeat($._node),
choice($.end_tag, $._implicit_end_tag),
),
$.self_closing_tag,
),
// <script> and <style> hold unparsed raw text instead of child nodes. Their
// start tags are aliased so they appear as ordinary `start_tag` nodes.
script_element: $ => seq(
alias($.script_start_tag, $.start_tag),
optional($.raw_text),
$.end_tag,
),
style_element: $ => seq(
alias($.style_start_tag, $.start_tag),
optional($.raw_text),
$.end_tag,
),
// The three start-tag rules differ only in which external tag-name token
// they accept; all of them expose the name as `tag_name`.
start_tag: $ => seq(
'<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute),
'>',
),
script_start_tag: $ => seq(
'<',
alias($._script_start_tag_name, $.tag_name),
repeat($.attribute),
'>',
),
style_start_tag: $ => seq(
'<',
alias($._style_start_tag_name, $.tag_name),
repeat($.attribute),
'>',
),
self_closing_tag: $ => seq(
'<',
alias($._start_tag_name, $.tag_name),
repeat($.attribute),
'/>',
),
end_tag: $ => seq(
'</',
alias($._end_tag_name, $.tag_name),
'>',
),
// An end tag whose name the scanner reported as erroneous.
erroneous_end_tag: $ => seq(
'</',
$.erroneous_end_tag_name,
'>',
),
// A name optionally followed by `=` and a quoted or unquoted value.
attribute: $ => seq(
$.attribute_name,
optional(seq(
'=',
choice(
$.attribute_value,
$.quoted_attribute_value,
),
)),
),
attribute_name: _ => /[^<>"'/=\s]+/,
attribute_value: _ => /[^<>"'=\s]+/,
// An entity can be named, numeric (decimal), or numeric (hexadecimal). The
// longest entity name is 29 characters long, and the HTML spec says that
// no more will ever be added.
entity: _ => /&(#([xX][0-9a-fA-F]{1,6}|[0-9]{1,5})|[A-Za-z]{1,30});/,
// Single- or double-quoted value; the text between the quotes (if any) is
// exposed as an `attribute_value` node.
quoted_attribute_value: $ => choice(
seq('\'', optional(alias(/[^']+/, $.attribute_value)), '\''),
seq('"', optional(alias(/[^"]+/, $.attribute_value)), '"'),
),
// Text starts and ends with a non-whitespace character and contains no
// `<`, `>` or `&`.
text: _ => /[^<>&\s]([^<>&]*[^<>&\s])?/,
},
});

452
html/src/scanner.c Normal file
View File

@ -0,0 +1,452 @@
#include "tag.h"
#include <wctype.h>
// Tokens this external scanner can emit through `lexer->result_symbol`. The
// enumerators are also used as indices into the `valid_symbols` array passed
// to the scan functions. The order matches the `externals` list in grammar.js.
enum TokenType {
START_TAG_NAME,
SCRIPT_START_TAG_NAME,
STYLE_START_TAG_NAME,
END_TAG_NAME,
ERRONEOUS_END_TAG_NAME,
SELF_CLOSING_TAG_DELIMITER,
IMPLICIT_END_TAG,
RAW_TEXT,
COMMENT
};
// Growable array of Tag values, manipulated through the VEC_* macros.
typedef struct {
// Number of tags currently stored.
uint32_t len;
// Number of tags `data` has room for.
uint32_t cap;
// Heap storage (NULL until the first allocation).
Tag *data;
} tags_vec;
// Scanner state: the stack of currently open tags (last element = innermost).
typedef struct {
tags_vec tags;
} Scanner;
// Larger of two values. Both arguments may be evaluated twice, so do not pass
// expressions with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// ---------------------------------------------------------------------------
// Vector helpers. `vec` is any struct with `len`, `cap` and `data` members.
// Every statement-like macro is wrapped in do { } while (0) so that it behaves
// as a single statement (safe in unbraced if/else) and must be followed by ';'.
// Allocation failures abort via assert(), matching the rest of this file.
// ---------------------------------------------------------------------------

// Last element of the vector; the vector must not be empty.
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

// Grow the storage to `_cap` elements. Never shrinks, and ignores `_cap == 0`.
#define VEC_RESIZE(vec, _cap)                                                  \
    do {                                                                       \
        if ((_cap) > (vec).cap && (_cap) > 0) {                                \
            void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0]));   \
            assert(tmp != NULL);                                               \
            (vec).data = tmp;                                                  \
            (vec).cap = (_cap);                                                \
        }                                                                      \
    } while (0)

// Ensure the vector has room for at least `_cap` elements.
#define VEC_GROW(vec, _cap)                                                    \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            VEC_RESIZE((vec), (_cap));                                         \
        }                                                                      \
    } while (0)

// Append `el`, doubling the capacity (minimum 16) when the vector is full.
#define VEC_PUSH(vec, el)                                                      \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Remove the last Tag, releasing its custom name if it has one. The vector
// must not be empty.
#define VEC_POP(vec)                                                           \
    do {                                                                       \
        if (VEC_BACK(vec).type == CUSTOM) {                                    \
            tag_free(&VEC_BACK(vec));                                          \
        }                                                                      \
        (vec).len--;                                                           \
    } while (0)

// Release the vector's storage. Elements are not freed individually.
#define VEC_FREE(vec)                                                          \
    do {                                                                       \
        free((vec).data); /* free(NULL) is a no-op */                          \
        (vec).data = NULL;                                                     \
    } while (0)

// Free every Tag in the vector and reset its length; capacity is kept.
#define VEC_CLEAR(vec)                                                         \
    do {                                                                       \
        for (uint32_t vec_clear_index_ = 0; vec_clear_index_ < (vec).len;      \
             vec_clear_index_++) {                                             \
            tag_free(&(vec).data[vec_clear_index_]);                           \
        }                                                                      \
        (vec).len = 0;                                                         \
    } while (0)

// ---------------------------------------------------------------------------
// String helpers. Same layout as the vectors, but the storage always holds
// `cap + 1` bytes and everything from `len` onwards is zeroed, so `data` is
// always a NUL-terminated C string.
// ---------------------------------------------------------------------------

// Reallocate the string to hold `_cap` characters plus the terminator.
#define STRING_RESIZE(vec, _cap)                                               \
    do {                                                                       \
        void *tmp =                                                            \
            realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0]));         \
        assert(tmp != NULL);                                                   \
        (vec).data = tmp;                                                      \
        memset((vec).data + (vec).len, 0,                                      \
               (((_cap) + 1) - (vec).len) * sizeof((vec).data[0]));            \
        (vec).cap = (_cap);                                                    \
    } while (0)

// Ensure the string has room for at least `_cap` characters.
#define STRING_GROW(vec, _cap)                                                 \
    do {                                                                       \
        if ((vec).cap < (_cap)) {                                              \
            STRING_RESIZE((vec), (_cap));                                      \
        }                                                                      \
    } while (0)

// Append one character, doubling the capacity (minimum 16) when full.
#define STRING_PUSH(vec, el)                                                   \
    do {                                                                       \
        if ((vec).cap == (vec).len) {                                          \
            STRING_RESIZE((vec), MAX(16, (vec).len * 2));                      \
        }                                                                      \
        (vec).data[(vec).len++] = (el);                                        \
    } while (0)

// Initialise an empty string with room for 16 characters.
#define STRING_INIT(vec)                                                       \
    do {                                                                       \
        (vec).data = calloc(1, sizeof(char) * 17);                             \
        assert((vec).data != NULL);                                            \
        (vec).len = 0;                                                         \
        (vec).cap = 16;                                                        \
    } while (0)

// Release the string's storage.
#define STRING_FREE(vec)                                                       \
    do {                                                                       \
        free((vec).data); /* free(NULL) is a no-op */                          \
        (vec).data = NULL;                                                     \
    } while (0)

// Empty the string but keep its storage.
#define STRING_CLEAR(vec)                                                      \
    do {                                                                       \
        (vec).len = 0;                                                         \
        memset((vec).data, 0, (vec).cap * sizeof(char));                       \
    } while (0)
// Write the open-tag stack into `buffer` and return the number of bytes used.
//
// Layout: [uint16 tags actually written][uint16 total tags on the stack],
// followed by one record per written tag: a type byte, and for CUSTOM tags a
// length byte (capped at UINT8_MAX) plus that many name bytes. Writing stops
// early once the next record would reach
// TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
static unsigned serialize(Scanner *scanner, char *buffer) {
    const tags_vec *stack = &scanner->tags;
    uint16_t total = stack->len > UINT16_MAX ? UINT16_MAX : stack->len;
    uint16_t written = 0;

    // The first uint16 slot is filled in last, once `written` is known.
    unsigned offset = sizeof(total);
    memcpy(&buffer[offset], &total, sizeof(total));
    offset += sizeof(total);

    while (written < total) {
        Tag tag = stack->data[written];
        if (tag.type != CUSTOM) {
            if (offset + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
                break;
            }
            buffer[offset++] = (char)tag.type;
        } else {
            unsigned name_length = tag.custom_tag_name.len;
            if (name_length > UINT8_MAX) {
                name_length = UINT8_MAX;
            }
            if (offset + 2 + name_length >=
                TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
                break;
            }
            buffer[offset++] = (char)tag.type;
            buffer[offset++] = (char)name_length;
            strncpy(&buffer[offset], tag.custom_tag_name.data, name_length);
            offset += name_length;
        }
        written++;
    }

    memcpy(&buffer[0], &written, sizeof(written));
    return offset;
}
// Rebuild the open-tag stack from a buffer produced by serialize().
//
// The buffer starts with two uint16 values: the number of tag records that
// follow, and the total number of tags that were on the stack. Each record is
// a type byte, followed for CUSTOM tags by a length byte and the name bytes.
// An empty buffer (length == 0) simply resets the stack.
static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
    VEC_CLEAR(scanner->tags);
    if (length == 0) {
        return;
    }

    unsigned size = 0;
    uint16_t tag_count = 0;
    uint16_t serialized_tag_count = 0;
    memcpy(&serialized_tag_count, &buffer[size], sizeof(serialized_tag_count));
    size += sizeof(serialized_tag_count);
    memcpy(&tag_count, &buffer[size], sizeof(tag_count));
    size += sizeof(tag_count);
    if (tag_count == 0) {
        return;
    }

    VEC_RESIZE(scanner->tags, tag_count);

    unsigned iter = 0;
    for (; iter < serialized_tag_count; iter++) {
        // Start from a fully initialised tag. The freshly reserved vector
        // storage is uninitialised, and a non-custom tag must carry a NULL
        // name pointer so that later cleanup never frees garbage.
        Tag tag = new_tag();
        tag.type = (TagType)buffer[size++];
        if (tag.type == CUSTOM) {
            uint16_t name_length = (uint8_t)buffer[size++];
            char *name = calloc((size_t)name_length + 1, sizeof(char));
            assert(name != NULL);
            memcpy(name, &buffer[size], name_length);
            tag.custom_tag_name.data = name;
            tag.custom_tag_name.len = name_length;
            tag.custom_tag_name.cap = name_length;
            size += name_length;
        }
        VEC_PUSH(scanner->tags, tag);
    }

    // Add placeholder tags if we didn't read enough; this happens when the
    // serialization buffer had no more room but the stack held more tags.
    for (; iter < tag_count; iter++) {
        Tag tag = new_tag();
        VEC_PUSH(scanner->tags, tag);
    }
}
// Consume a run of alphanumerics, '-' and ':' from the lexer and return it
// upper-cased in a freshly allocated String. The caller owns the result and
// must release it with STRING_FREE. The String is empty (len == 0) when the
// lookahead is not a tag-name character.
static String scan_tag_name(TSLexer *lexer) {
    String name;
    STRING_INIT(name);
    for (;;) {
        int32_t c = lexer->lookahead;
        if (!(iswalnum(c) || c == '-' || c == ':')) {
            break;
        }
        STRING_PUSH(name, towupper(c));
        lexer->advance(lexer, false);
    }
    return name;
}
// Scan the remainder of a comment. Called with the lookahead positioned just
// after "<!": expects "--", then consumes input until a '>' that is directly
// preceded by at least two '-'. Returns true and emits COMMENT on success;
// returns false if the opening "--" is missing or input ends first.
static bool scan_comment(TSLexer *lexer) {
    // Both dashes of the opening "--" must be present.
    for (int expected = 0; expected < 2; expected++) {
        if (lexer->lookahead != '-') {
            return false;
        }
        lexer->advance(lexer, false);
    }

    // Number of consecutive '-' seen immediately before the current character.
    unsigned trailing_dashes = 0;
    while (lexer->lookahead) {
        int32_t c = lexer->lookahead;
        if (c == '-') {
            ++trailing_dashes;
        } else if (c == '>' && trailing_dashes >= 2) {
            lexer->result_symbol = COMMENT;
            lexer->advance(lexer, false);
            lexer->mark_end(lexer);
            return true;
        } else {
            // Any other character (including a '>' without "--") breaks the run.
            trailing_dashes = 0;
        }
        lexer->advance(lexer, false);
    }
    return false;
}
// Scan the raw contents of a <script> or <style> element.
//
// Consumes input until the matching end delimiter ("</SCRIPT" or "</STYLE",
// compared case-insensitively) or end of input. The token ends just before
// the delimiter, which is left for the grammar to parse as the end tag.
// Returns false only when no tag is open; otherwise emits RAW_TEXT (possibly
// empty) and returns true.
static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
    if (scanner->tags.len == 0) {
        return false;
    }

    lexer->mark_end(lexer);

    const char *end_delimiter =
        VEC_BACK(scanner->tags).type == SCRIPT ? "</SCRIPT" : "</STYLE";
    const size_t delimiter_length = strlen(end_delimiter);

    size_t delimiter_index = 0;
    while (lexer->lookahead) {
        if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
            delimiter_index++;
            if (delimiter_index == delimiter_length) {
                break;
            }
            lexer->advance(lexer, false);
        } else if (delimiter_index > 0) {
            // A partial delimiter match failed. Everything consumed so far is
            // raw text, but the current character has not been consumed yet
            // and may itself start the real delimiter (e.g. "<</script>"), so
            // restart matching on it instead of skipping it. Since '<' only
            // occurs at index 0 of the delimiter, restarting from index 0 is
            // sufficient.
            delimiter_index = 0;
            lexer->mark_end(lexer);
        } else {
            lexer->advance(lexer, false);
            lexer->mark_end(lexer);
        }
    }

    lexer->result_symbol = RAW_TEXT;
    return true;
}
// Decide whether the innermost open element must be closed implicitly.
//
// Called with the lookahead just after '<' (or at end of input). When an
// implicit end tag is required, the innermost tag is popped, IMPLICIT_END_TAG
// is emitted and true is returned. That happens when:
//   * the upcoming token is not a closing tag and the innermost element is a
//     void element;
//   * a closing tag does not match the innermost element but its type matches
//     some element further down the stack;
//   * an opening tag names an element the innermost element cannot contain.
// In every other case the function returns false.
static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
    Tag *parent = scanner->tags.len == 0 ? NULL : &VEC_BACK(scanner->tags);

    const bool is_closing_tag = lexer->lookahead == '/';
    if (is_closing_tag) {
        lexer->advance(lexer, false);
    } else if (parent != NULL && is_void(parent)) {
        VEC_POP(scanner->tags);
        lexer->result_symbol = IMPLICIT_END_TAG;
        return true;
    }

    String tag_name = scan_tag_name(lexer);
    if (tag_name.len == 0) {
        STRING_FREE(tag_name);
        return false;
    }
    Tag next_tag = for_name(tag_name.data);

    bool close_parent = false;
    if (is_closing_tag) {
        // A tag that correctly closes the topmost element is an ordinary end
        // tag and is not handled here.
        const bool closes_top =
            scanner->tags.len > 0 &&
            tagcmp(&VEC_BACK(scanner->tags), &next_tag);
        if (!closes_top) {
            // Otherwise, dig deeper and queue implicit end tags (to be nice
            // in the case of malformed HTML).
            for (unsigned depth = scanner->tags.len;
                 depth > 0 && !close_parent; depth--) {
                close_parent =
                    scanner->tags.data[depth - 1].type == next_tag.type;
            }
        }
    } else if (parent != NULL && !can_contain(parent, &next_tag)) {
        close_parent = true;
    }

    if (close_parent) {
        VEC_POP(scanner->tags);
        lexer->result_symbol = IMPLICIT_END_TAG;
    }
    STRING_FREE(tag_name);
    tag_free(&next_tag);
    return close_parent;
}
// Scan the name of an opening tag and push the tag onto the open-tag stack.
// Emits SCRIPT_START_TAG_NAME or STYLE_START_TAG_NAME for those two elements
// and START_TAG_NAME for everything else. Returns false when no name
// characters are present.
static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
    String name = scan_tag_name(lexer);
    if (name.len == 0) {
        STRING_FREE(name);
        return false;
    }

    // The stack takes ownership of the tag (and of its custom name, if any).
    Tag opened = for_name(name.data);
    VEC_PUSH(scanner->tags, opened);

    if (opened.type == SCRIPT) {
        lexer->result_symbol = SCRIPT_START_TAG_NAME;
    } else if (opened.type == STYLE) {
        lexer->result_symbol = STYLE_START_TAG_NAME;
    } else {
        lexer->result_symbol = START_TAG_NAME;
    }

    STRING_FREE(name);
    return true;
}
// Scan the name of a closing tag. If it matches the innermost open tag, that
// tag is popped and END_TAG_NAME is emitted; otherwise the stack is left
// untouched and ERRONEOUS_END_TAG_NAME is emitted. Returns false when no name
// characters are present.
static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
    String name = scan_tag_name(lexer);
    if (name.len == 0) {
        STRING_FREE(name);
        return false;
    }

    Tag closing = for_name(name.data);
    const bool matches_innermost =
        scanner->tags.len > 0 && tagcmp(&VEC_BACK(scanner->tags), &closing);
    if (matches_innermost) {
        VEC_POP(scanner->tags);
    }
    lexer->result_symbol =
        matches_innermost ? END_TAG_NAME : ERRONEOUS_END_TAG_NAME;

    tag_free(&closing);
    STRING_FREE(name);
    return true;
}
// Scan the "/>" that ends a self-closing tag. Called with the lookahead on
// '/'. On success the tag opened by this start tag is popped from the stack
// (if there is one), SELF_CLOSING_TAG_DELIMITER is emitted and true is
// returned. Returns false when the '/' is not followed by '>'.
static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
    lexer->advance(lexer, false);
    if (lexer->lookahead != '>') {
        return false;
    }
    lexer->advance(lexer, false);

    if (scanner->tags.len > 0) {
        VEC_POP(scanner->tags);
    }
    // Always name the token when reporting success. Previously an empty stack
    // returned true with result_symbol left unset, so the parser received
    // whatever symbol happened to be stored there.
    lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
    return true;
}
// Scanner entry point: choose a sub-scanner from the set of tokens the parser
// can accept (`valid_symbols`) and the next character. Returns true when a
// token was recognised; the token kind is stored in `lexer->result_symbol`.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    const bool tag_name_valid =
        valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME];

    // Raw text is scanned only when no tag name could appear here instead.
    if (valid_symbols[RAW_TEXT] && !tag_name_valid) {
        return scan_raw_text(scanner, lexer);
    }

    // Skip leading whitespace without including it in the token.
    while (iswspace(lexer->lookahead)) {
        lexer->advance(lexer, true);
    }

    const int32_t first = lexer->lookahead;

    if (first == '<') {
        // End the token before '<' so that an implicit end tag is zero-width.
        lexer->mark_end(lexer);
        lexer->advance(lexer, false);
        if (lexer->lookahead == '!') {
            lexer->advance(lexer, false);
            return scan_comment(lexer);
        }
        if (valid_symbols[IMPLICIT_END_TAG]) {
            return scan_implicit_end_tag(scanner, lexer);
        }
        return false;
    }

    if (first == '\0') {
        if (valid_symbols[IMPLICIT_END_TAG]) {
            return scan_implicit_end_tag(scanner, lexer);
        }
        return false;
    }

    if (first == '/') {
        if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
            return scan_self_closing_tag_delimiter(scanner, lexer);
        }
        return false;
    }

    if (tag_name_valid && !valid_symbols[RAW_TEXT]) {
        if (valid_symbols[START_TAG_NAME]) {
            return scan_start_tag_name(scanner, lexer);
        }
        return scan_end_tag_name(scanner, lexer);
    }
    return false;
}
void *tree_sitter_html_external_scanner_create() {
Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner));
return scanner;
}
// Exported scan hook: forwards to scan() with `payload` viewed as the Scanner.
bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer,
                                            const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
unsigned tree_sitter_html_external_scanner_serialize(void *payload,
char *buffer) {
Scanner *scanner = (Scanner *)payload;
return serialize(scanner, buffer);
}
void tree_sitter_html_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {
Scanner *scanner = (Scanner *)payload;
deserialize(scanner, buffer, length);
}
// Exported destroy hook: release every tag still on the stack, the stack
// storage and the Scanner itself. A NULL payload is ignored.
void tree_sitter_html_external_scanner_destroy(void *payload) {
    Scanner *scanner = (Scanner *)payload;
    if (scanner == NULL) {
        return;
    }
    for (unsigned i = 0; i < scanner->tags.len; i++) {
        // tag_free() only releases the name of CUSTOM tags, the only tags
        // that own one. This matches how tags are released elsewhere in this
        // file and never frees the name pointer of a non-custom tag.
        tag_free(&scanner->tags.data[i]);
    }
    VEC_FREE(scanner->tags);
    free(scanner);
}

384
html/src/tag.h Normal file
View File

@ -0,0 +1,384 @@
#include "tree_sitter/parser.h"
#include <assert.h>
#include <string.h>
// Identifies an HTML element by name. The enumerator order is significant:
// is_void() treats every value below END_OF_VOID_TAGS as a void element.
typedef enum {
// Void elements; must stay before END_OF_VOID_TAGS.
AREA,
BASE,
BASEFONT,
BGSOUND,
BR,
COL,
COMMAND,
EMBED,
FRAME,
HR,
IMAGE,
IMG,
INPUT,
ISINDEX,
KEYGEN,
LINK,
MENUITEM,
META,
NEXTID,
PARAM,
SOURCE,
TRACK,
WBR,
// Sentinel separating void elements (above) from all other elements (below).
END_OF_VOID_TAGS,
A,
ABBR,
ADDRESS,
ARTICLE,
ASIDE,
AUDIO,
B,
BDI,
BDO,
BLOCKQUOTE,
BODY,
BUTTON,
CANVAS,
CAPTION,
CITE,
CODE,
COLGROUP,
DATA,
DATALIST,
DD,
DEL,
DETAILS,
DFN,
DIALOG,
DIV,
DL,
DT,
EM,
FIELDSET,
FIGCAPTION,
FIGURE,
FOOTER,
FORM,
H1,
H2,
H3,
H4,
H5,
H6,
HEAD,
HEADER,
HGROUP,
HTML,
I,
IFRAME,
INS,
KBD,
LABEL,
LEGEND,
LI,
MAIN,
MAP,
MARK,
MATH,
MENU,
METER,
NAV,
NOSCRIPT,
OBJECT,
OL,
OPTGROUP,
OPTION,
OUTPUT,
P,
PICTURE,
PRE,
PROGRESS,
Q,
RB,
RP,
RT,
RTC,
RUBY,
S,
SAMP,
SCRIPT,
SECTION,
SELECT,
SLOT,
SMALL,
SPAN,
STRONG,
STYLE,
SUB,
SUMMARY,
SUP,
SVG,
TABLE,
TBODY,
TD,
TEMPLATE,
TEXTAREA,
TFOOT,
TH,
THEAD,
TIME,
TITLE,
TR,
U,
UL,
VAR,
VIDEO,
// Any tag name not found in TAG_TYPES_BY_TAG_NAME; the actual name is kept in
// Tag.custom_tag_name.
CUSTOM,
// Placeholder type given to tags created by new_tag().
END_,
} TagType;
// Heap-allocated character buffer with an explicit length and capacity.
typedef struct {
// Number of characters in use.
uint32_t len;
// Number of characters the buffer was sized for.
uint32_t cap;
// Character storage; NULL when nothing is allocated.
char *data;
} String;
// One entry of the tag-name lookup table: a tag name and its TagType.
typedef struct {
// NUL-terminated tag name; at most 15 characters fit.
char tag_name[16];
TagType tag_value;
} TagMap;
// An element on the scanner's open-tag stack.
typedef struct {
TagType type;
// Name of the element when `type` is CUSTOM (see make_tag); tag_free()
// releases it only for CUSTOM tags.
String custom_tag_name;
} Tag;
// Lookup table from tag name to TagType, searched linearly by
// get_tag_from_string(). Names are stored upper-case, so lookups must use an
// upper-cased name.
//
// The table is `static` because it is defined in a header: without internal
// linkage every translation unit (or other grammar) that includes a copy of
// this header would export the same symbol and fail to link.
static const TagMap TAG_TYPES_BY_TAG_NAME[126] = {
    {"AREA", AREA},
    {"BASE", BASE},
    {"BASEFONT", BASEFONT},
    {"BGSOUND", BGSOUND},
    {"BR", BR},
    {"COL", COL},
    {"COMMAND", COMMAND},
    {"EMBED", EMBED},
    {"FRAME", FRAME},
    {"HR", HR},
    {"IMAGE", IMAGE},
    {"IMG", IMG},
    {"INPUT", INPUT},
    {"ISINDEX", ISINDEX},
    {"KEYGEN", KEYGEN},
    {"LINK", LINK},
    {"MENUITEM", MENUITEM},
    {"META", META},
    {"NEXTID", NEXTID},
    {"PARAM", PARAM},
    {"SOURCE", SOURCE},
    {"TRACK", TRACK},
    {"WBR", WBR},
    {"A", A},
    {"ABBR", ABBR},
    {"ADDRESS", ADDRESS},
    {"ARTICLE", ARTICLE},
    {"ASIDE", ASIDE},
    {"AUDIO", AUDIO},
    {"B", B},
    {"BDI", BDI},
    {"BDO", BDO},
    {"BLOCKQUOTE", BLOCKQUOTE},
    {"BODY", BODY},
    {"BUTTON", BUTTON},
    {"CANVAS", CANVAS},
    {"CAPTION", CAPTION},
    {"CITE", CITE},
    {"CODE", CODE},
    {"COLGROUP", COLGROUP},
    {"DATA", DATA},
    {"DATALIST", DATALIST},
    {"DD", DD},
    {"DEL", DEL},
    {"DETAILS", DETAILS},
    {"DFN", DFN},
    {"DIALOG", DIALOG},
    {"DIV", DIV},
    {"DL", DL},
    {"DT", DT},
    {"EM", EM},
    {"FIELDSET", FIELDSET},
    {"FIGCAPTION", FIGCAPTION},
    {"FIGURE", FIGURE},
    {"FOOTER", FOOTER},
    {"FORM", FORM},
    {"H1", H1},
    {"H2", H2},
    {"H3", H3},
    {"H4", H4},
    {"H5", H5},
    {"H6", H6},
    {"HEAD", HEAD},
    {"HEADER", HEADER},
    {"HGROUP", HGROUP},
    {"HTML", HTML},
    {"I", I},
    {"IFRAME", IFRAME},
    {"INS", INS},
    {"KBD", KBD},
    {"LABEL", LABEL},
    {"LEGEND", LEGEND},
    {"LI", LI},
    {"MAIN", MAIN},
    {"MAP", MAP},
    {"MARK", MARK},
    {"MATH", MATH},
    {"MENU", MENU},
    {"METER", METER},
    {"NAV", NAV},
    {"NOSCRIPT", NOSCRIPT},
    {"OBJECT", OBJECT},
    {"OL", OL},
    {"OPTGROUP", OPTGROUP},
    {"OPTION", OPTION},
    {"OUTPUT", OUTPUT},
    {"P", P},
    {"PICTURE", PICTURE},
    {"PRE", PRE},
    {"PROGRESS", PROGRESS},
    {"Q", Q},
    {"RB", RB},
    {"RP", RP},
    {"RT", RT},
    {"RTC", RTC},
    {"RUBY", RUBY},
    {"S", S},
    {"SAMP", SAMP},
    {"SCRIPT", SCRIPT},
    {"SECTION", SECTION},
    {"SELECT", SELECT},
    {"SLOT", SLOT},
    {"SMALL", SMALL},
    {"SPAN", SPAN},
    {"STRONG", STRONG},
    {"STYLE", STYLE},
    {"SUB", SUB},
    {"SUMMARY", SUMMARY},
    {"SUP", SUP},
    {"SVG", SVG},
    {"TABLE", TABLE},
    {"TBODY", TBODY},
    {"TD", TD},
    {"TEMPLATE", TEMPLATE},
    {"TEXTAREA", TEXTAREA},
    {"TFOOT", TFOOT},
    {"TH", TH},
    {"THEAD", THEAD},
    {"TIME", TIME},
    {"TITLE", TITLE},
    {"TR", TR},
    {"U", U},
    {"UL", UL},
    {"VAR", VAR},
    {"VIDEO", VIDEO},
    {"CUSTOM", CUSTOM},
};
// Elements that a <p> cannot contain; can_contain() reports false for a P
// parent when the child is one of these. The list has 26 entries.
static const TagType TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[] = {
    ADDRESS,
    ARTICLE,
    ASIDE,
    BLOCKQUOTE,
    DETAILS,
    DIV,
    DL,
    FIELDSET,
    FIGCAPTION,
    FIGURE,
    FOOTER,
    FORM,
    H1,
    H2,
    H3,
    H4,
    H5,
    H6,
    HEADER,
    HR,
    MAIN,
    NAV,
    OL,
    P,
    PRE,
    SECTION,
};
static TagType get_tag_from_string(const char *tag_name) {
for (int i = 0; i < 126; i++) {
if (strcmp(TAG_TYPES_BY_TAG_NAME[i].tag_name, tag_name) == 0) {
return TAG_TYPES_BY_TAG_NAME[i].tag_value;
}
}
return CUSTOM;
}
static inline Tag new_tag() {
Tag tag;
tag.type = END_;
tag.custom_tag_name.data = NULL;
tag.custom_tag_name.len = 0;
tag.custom_tag_name.cap = 0;
return tag;
}
// Build a tag of the given type. For CUSTOM tags a private, NUL-terminated
// copy of `name` is stored in custom_tag_name; the caller releases it with
// tag_free(). For every other type `name` is ignored and no memory is
// allocated. Allocation failure aborts via assert().
static Tag make_tag(TagType type, const char *name) {
    Tag tag = new_tag();
    tag.type = type;
    if (type == CUSTOM) {
        const size_t name_length = strlen(name);
        // calloc zero-fills, so the extra byte is the terminator.
        char *copy = calloc(name_length + 1, sizeof(char));
        assert(copy != NULL);
        memcpy(copy, name, name_length);
        tag.custom_tag_name.data = copy;
        tag.custom_tag_name.len = (uint32_t)name_length;
        // Record the capacity too, so the String's fields agree with its
        // allocation.
        tag.custom_tag_name.cap = (uint32_t)name_length;
    }
    return tag;
}
// Release the custom name owned by a CUSTOM tag and clear the name pointer.
// For other tag types nothing is freed; the pointer is still reset to NULL.
static inline void tag_free(Tag *tag) {
    String *name = &tag->custom_tag_name;
    if (tag->type == CUSTOM) {
        free(name->data);
    }
    name->data = NULL;
}
static inline bool is_void(const Tag *tag) {
return tag->type < END_OF_VOID_TAGS;
}
static inline Tag for_name(const char *name) {
return make_tag(get_tag_from_string(name), name);
}
// True when both tags denote the same element: their types are equal and,
// for CUSTOM tags, their names are equal as well.
static inline bool tagcmp(const Tag *_tag1, const Tag *_tag2) {
    if (_tag1->type != _tag2->type) {
        return false;
    }
    if (_tag1->type != CUSTOM) {
        return true;
    }
    return strcmp(_tag1->custom_tag_name.data,
                  _tag2->custom_tag_name.data) == 0;
}
// Whether an element of type `self` may directly contain an element of type
// `other`. A false result is what makes the scanner close `self` implicitly.
// Only the parent types handled below are restricted; every other parent can
// contain anything.
static bool can_contain(Tag *self, const Tag *other) {
    const TagType parent = self->type;
    const TagType child = other->type;

    if (parent == LI) {
        return child != LI;
    }
    if (parent == DT || parent == DD) {
        return !(child == DT || child == DD);
    }
    if (parent == P) {
        const size_t blocked_count =
            sizeof(TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS) /
            sizeof(TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[0]);
        for (size_t k = 0; k < blocked_count; k++) {
            if (TAG_TYPES_NOT_ALLOWED_IN_PARAGRAPHS[k] == child) {
                return false;
            }
        }
        return true;
    }
    if (parent == COLGROUP) {
        return child == COL;
    }
    if (parent == RB || parent == RT || parent == RP) {
        return !(child == RB || child == RT || child == RP);
    }
    if (parent == OPTGROUP) {
        return child != OPTGROUP;
    }
    if (parent == TR) {
        return child != TR;
    }
    if (parent == TD || parent == TH) {
        return !(child == TD || child == TH || child == TR);
    }
    return true;
}

365
html/test/corpus/main.txt Normal file
View File

@ -0,0 +1,365 @@
===================================
Tags
===================================
<span>Hello</span>
---
(fragment
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name))))
===================================
Tags with attributes
===================================
<input value=yes class="a" data-💩></input>
---
(fragment
(element
(start_tag
(tag_name)
(attribute
(attribute_name)
(attribute_value))
(attribute
(attribute_name)
(quoted_attribute_value (attribute_value)))
(attribute
(attribute_name)))
(end_tag (tag_name))))
===================================
Nested tags
===================================
<div>
<span>a</span>
b
<b>c</b>
Multi-line
text
</div>
---
(fragment
(element
(start_tag (tag_name))
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name)))
(text)
(element
(start_tag (tag_name))
(text)
(end_tag (tag_name)))
(text)
(end_tag (tag_name))))
==================================
Void tags
==================================
<form><img src="something.png"><br><input type=submit value=Ok /></form>
---
(fragment
(element
(start_tag (tag_name))
(element
(start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag (tag_name)))
(element
(self_closing_tag
(tag_name)
(attribute (attribute_name) (attribute_value))
(attribute (attribute_name) (attribute_value))))
(end_tag (tag_name))))
==================================
Void tags at EOF
==================================
<img src="something.png">
---
(fragment
(element
(start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
==================================
Custom tags
==================================
<something:different>
<atom-text-editor mini>
Hello
</atom-text-editor>
</something:different>
---
(fragment
(element
(start_tag (tag_name))
(element
(start_tag (tag_name) (attribute (attribute_name)))
(text)
(end_tag (tag_name)))
(end_tag (tag_name))))
==================================
Comments
==================================
<!-- hello -->
<!-- world ->-> -- > ->->->-- -> still comment -->
<div>
<!-- <span>something</span> -->
</div>
---
(fragment
(comment)
(comment)
(element
(start_tag (tag_name))
(comment)
(end_tag (tag_name))))
==================================
Raw text elements
==================================
<script>
</s
</sc
</scr
</scri
</scrip
</script>
<style>
</ </s </st </sty </styl
</style>
<script>
</SCRIPT>
---
(fragment
(script_element
(start_tag (tag_name))
(raw_text)
(end_tag (tag_name)))
(style_element
(start_tag (tag_name))
(raw_text)
(end_tag (tag_name)))
(script_element
(start_tag (tag_name))
(raw_text)
(end_tag (tag_name))))
==================================
All-caps doctype
==================================
<!DOCTYPE html PUBLIC
"-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
---
(fragment
(doctype))
==================================
Lowercase doctype
==================================
<!doctype html>
---
(fragment
(doctype))
==================================
LI elements without close tags
==================================
<ul>
<li>One
<li>Two
</ul>
---
(fragment
(element
(start_tag (tag_name))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
======================================
DT and DL elements without close tags
======================================
<dl>
<dt>Coffee
<dt>Café
<dd>Black hot drink
<dt>Milk
<dd>White cold drink
</dl>
---
(fragment
(element
(start_tag (tag_name))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
======================================
P elements without close tags
======================================
<p>One
<div>Two</div>
<p>Three
<p>Four
<h1>Five</h1>
---
(fragment
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text) (end_tag (tag_name))))
======================================
Ruby annotation elements without close tags
======================================
<ruby>東<rb>京<rt>とう<rt>きょう</ruby>
---
(fragment
(element
(start_tag (tag_name))
(text)
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text))
(end_tag (tag_name))))
=======================================
COLGROUP elements without end tags
=======================================
<table>
<colgroup>
<col style="background-color: #0f0">
<col span="2">
<tr>
<th>Lime</th>
<th>Lemon</th>
<th>Orange</th>
</tr>
</table>
---
(fragment
(element
(start_tag (tag_name))
(element
(start_tag (tag_name))
(element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value)))))
(element (start_tag
(tag_name)
(attribute (attribute_name) (quoted_attribute_value (attribute_value))))))
(element
(start_tag (tag_name))
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(element (start_tag (tag_name)) (text) (end_tag (tag_name)))
(end_tag (tag_name)))
(end_tag (tag_name))))
=========================================
TR, TD, and TH elements without end tags
=========================================
<table>
<tr>
<th>One
<th>Two
<tr>
<td>Three
<td>Four
</table>
---
(fragment
(element
(start_tag (tag_name))
(element
(start_tag (tag_name))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text)))
(element
(start_tag (tag_name))
(element (start_tag (tag_name)) (text))
(element (start_tag (tag_name)) (text)))
(end_tag (tag_name))))
==============================
Named entities in tag contents
==============================
<p>Lorem ipsum &nbsp; dolor sit &copy; amet.</p>
---
(fragment
(element
(start_tag (tag_name))
(text)
(entity)
(text)
(entity)
(text)
(end_tag (tag_name))))
================================
Numeric entities in tag contents
================================
<p>Lorem ipsum &#160; dolor sit &#8212; amet.</p>
---
(fragment
(element
(start_tag (tag_name))
(text)
(entity)
(text)
(entity)
(text)
(end_tag (tag_name))))
=================================
Multiple entities in tag contents
=================================
<p>Lorem ipsum &#xA0; dolor &#xa0; sit &nbsp; amet.</p>
---
(fragment
(element
(start_tag (tag_name))
(text)
(entity)
(text)
(entity)
(text)
(entity)
(text)
(end_tag (tag_name))))

View File

@ -0,0 +1,39 @@
<div style="display: flex" draggable>
<!-- <- tag -->
<!--^^^^ attribute -->
<!-- ^^^^^^^^^^^^^ string -->
<!-- ^^^^^^^^^ attribute -->
<div onclick=tap>Hello, World</div>
<!-- <- punctuation.bracket -->
<div onclick=tap>Hello, World</div>
<!-- ^^^^^^^^ attribute -->
<div onclick=tap>Hello, World</div>
<!-- ^^^ string -->
<div onclick=tap>Hello, World</div>
<!-- ^^ punctuation.bracket -->
<div onclick="tap">Hello, World</div>
<!-- ^^^ tag -->
<div onclick="tap">Hello, World</div>
<!-- ^ punctuation.bracket -->
<something:different
<!-- <- punctuation.bracket -->
<!-- ^^^^^^^^^^^^^^^ tag -->
@click="count++"
<!--^^^^^^^ attribute -->
<!-- ^^^^^^^ string -->
:value="count"
<!--^^^^^^^ attribute -->
<!-- ^^^^^ string -->
@value:modelValue="newValue => count = newValue"
<!--^^^^^^^^^^^^^^^^^^ attribute -->
<!-- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ string -->
>
<!-- <- punctuation.bracket -->
</something:different>
<!-- <- punctuation.bracket -->
<!-- ^^^^^^^^^^^^^^^^ tag -->
<!-- ^ punctuation.bracket -->
</div>
<!-- <- punctuation.bracket -->
<!--^ tag -->
<!-- ^ punctuation.bracket -->

View File

@ -0,0 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- ^^^^^^^^^^^^^^^^^^^^ constant -->
<!DOCTYPE html>
<!-- ^^^^ constant -->
<!-- ^ punctuation.bracket -->

View File

@ -0,0 +1,7 @@
<span>
<!-- <- punctuation.bracket -->
<!--^ tag -->
</div>
<!--^ tag.error -->
<!-- ^ punctuation.bracket -->

View File

@ -0,0 +1,15 @@
<input />
<!-- <- punctuation.bracket -->
<!-- ^ tag -->
<input type="submit" readonly />
<!-- ^^^^ attribute -->
<input type="submit" readonly />
<!-- ^^^^^^ string -->
<input type="submit" readonly />
<!-- ^^^^^^^^ attribute -->
<input type="submit" readonly />
<!-- ^^ punctuation.bracket -->