2023-11-06 13:14:30 +00:00
module xml
import io
import os
import strings
2023-11-25 10:02:51 +03:00
const default_prolog_attributes = {
' v e r s i o n ' : ' 1 . 0 '
' e n c o d i n g ' : ' U T F - 8 '
}
const default_string_builder_cap = 32
2023-11-06 13:14:30 +00:00
2023-11-25 10:02:51 +03:00
const element_len = ' < ! E L E M E N T ' . len
const entity_len = ' < ! E N T I T Y ' . len
2023-11-06 13:14:30 +00:00
2023-11-25 10:02:51 +03:00
const doctype_chars = ' O C T Y P E ' . bytes ( )
const double_dash = ' - - ' . bytes ( )
const c_tag = ' [ C ' . bytes ( )
const data_chars = ' D A T A ' . bytes ( )
2023-11-13 12:24:39 +00:00
2023-11-25 10:02:51 +03:00
const byte_order_marking_first = u8 ( 0xEF )
const byte_order_marking_bytes = [ u8 ( 0xBB ) , 0xBF ]
2023-11-06 13:14:30 +00:00
// Helper types to assist in parsing
struct TextSpan {
mut :
start int
end int
}
enum AttributeParserState {
key
eq
value
}
fn parse_attributes ( attribute_contents string ) ! map [ string ] string {
if attribute_contents . contains_u8 ( ` < ` ) {
return error ( ' M a l f o r m e d X M L . F o u n d " < " i n a t t r i b u t e s t r i n g : " $ { attribute_contents } " ' )
}
mut attributes := map [ string ] string { }
mut state := AttributeParserState . key
mut key_span , mut value_span := TextSpan { } , TextSpan { }
for index , ch in attribute_contents {
match state {
. key {
match ch {
` = ` {
state = AttributeParserState . eq
}
else {
key_span . end ++
}
}
}
. eq {
match ch {
` = ` {
return error ( ' D u p l i c a t e " = " i n a t t r i b u t e s t r i n g : " $ { attribute_contents } " ' )
}
` ' ` , ` " ` {
state = AttributeParserState . value
value_span . start = index + 1
}
else {
return error ( ' I n v a l i d c h a r a c t e r i n a t t r i b u t e s t r i n g : " $ { attribute_contents } " ' )
}
}
}
. value {
match ch {
` ' ` , ` " ` {
state = AttributeParserState . key
value_span . end = index
attributes [ attribute_contents [ key_span . start .. key_span . end ] . trim_space ( ) ] = attribute_contents [ value_span . start .. value_span . end ]
key_span . start = index + 1
key_span . end = index + 1
}
else {
state = AttributeParserState . value
value_span . end ++
}
}
}
}
}
return attributes
}
fn parse_comment ( mut reader io . Reader ) ! XMLComment {
2024-09-10 16:25:56 +08:00
mut comment_buffer := strings . new_builder ( default_string_builder_cap )
2023-11-06 13:14:30 +00:00
mut local_buf := [ u8 ( 0 ) ]
for {
ch := next_char ( mut reader , mut local_buf ) !
match ch {
` - ` {
after_ch := next_char ( mut reader , mut local_buf ) !
if after_ch == ` - ` {
if next_char ( mut reader , mut local_buf ) ! == ` > ` {
break
}
return error ( ' X M L C o m m e n t n o t c l o s e d . E x p e c t e d " > " . ' )
} else {
comment_buffer . write_u8 ( ch )
comment_buffer . write_u8 ( after_ch )
}
}
else {
comment_buffer . write_u8 ( ch )
}
}
}
comment_contents := comment_buffer . str ( )
return XMLComment { comment_contents }
}
enum CDATAParserState {
normal
single
double
}
fn parse_cdata ( mut reader io . Reader ) ! XMLCData {
2024-09-10 16:25:56 +08:00
mut contents_buf := strings . new_builder ( default_string_builder_cap )
2023-11-06 13:14:30 +00:00
mut state := CDATAParserState . normal
mut local_buf := [ u8 ( 0 ) ]
for {
ch := next_char ( mut reader , mut local_buf ) !
contents_buf . write_u8 ( ch )
match ch {
` ] ` {
match state {
. double {
// Another ] after the ]] for some reason. Keep the state
}
. single {
state = . double
}
. normal {
state = . single
}
}
}
` > ` {
match state {
. double {
break
}
else {
state = . normal
}
}
}
else {
state = . normal
}
}
}
contents := contents_buf . str ( ) . trim_space ( )
if ! contents . ends_with ( ' ] ] > ' ) {
return error ( ' C D A T A s e c t i o n n o t c l o s e d . ' )
}
return XMLCData { contents [ 1 .. contents . len - 3 ] }
}
fn parse_entity ( contents string ) ! ( DTDEntity , string ) {
// We find the nearest '>' to the start of the ENTITY
entity_end := contents . index ( ' > ' ) or { return error ( ' E n t i t y d e c l a r a t i o n n o t c l o s e d . ' ) }
2024-09-10 16:25:56 +08:00
entity_contents := contents [ entity_len .. entity_end ]
2023-11-06 13:14:30 +00:00
name := entity_contents . trim_left ( ' \t \n ' ) . all_before ( ' ' )
2024-04-18 01:44:31 +02:00
if name == ' ' {
2023-11-06 13:14:30 +00:00
return error ( ' E n t i t y i s m i s s i n g n a m e . ' )
}
value := entity_contents . all_after_first ( name ) . trim_space ( ) . trim ( ' " \' ' )
if value . len == 0 {
return error ( ' E n t i t y i s m i s s i n g v a l u e . ' )
}
// TODO: Add support for SYSTEM and PUBLIC entities
return DTDEntity { name , value } , contents [ entity_end + 1 .. ]
}
fn parse_element ( contents string ) ! ( DTDElement , string ) {
// We find the nearest '>' to the start of the ELEMENT
element_end := contents . index ( ' > ' ) or { return error ( ' E l e m e n t d e c l a r a t i o n n o t c l o s e d . ' ) }
2025-09-22 21:14:52 +02:00
element_contents := contents [ element_len .. element_end ] . trim_left ( ' \t \r \n ' )
2023-11-06 13:14:30 +00:00
mut name_span := TextSpan { }
for ch in element_contents {
match ch {
` ` , ` \t ` , ` \n ` {
break
}
// Valid characters in an entity name are:
// 1. Lowercase alphabet - a-z
// 2. Uppercase alphabet - A-Z
// 3. Numbers - 0-9
// 4. Underscore - _
// 5. Colon - :
// 6. Period - .
` a ` ... ` z ` , ` A ` ... ` Z ` , ` 0 ` ... ` 9 ` , ` _ ` , ` : ` , ` . ` {
name_span . end ++
}
else {
return error ( ' I n v a l i d c h a r a c t e r i n e l e m e n t n a m e : " $ { ch } " ' )
}
}
}
name := element_contents [ name_span . start .. name_span . end ] . trim_left ( ' \t \n ' )
2024-04-18 01:44:31 +02:00
if name == ' ' {
2023-11-06 13:14:30 +00:00
return error ( ' E l e m e n t i s m i s s i n g n a m e . ' )
}
definition_string := element_contents . all_after_first ( name ) . trim_space ( ) . trim ( ' " \' ' )
definition := if definition_string . starts_with ( ' ( ' ) {
// We have a list of possible children
// Ensure that both ( and ) are present
if ! definition_string . ends_with ( ' ) ' ) {
return error ( ' E l e m e n t d e c l a r a t i o n n o t c l o s e d . ' )
}
definition_string . trim ( ' ( ) ' ) . split ( ' , ' )
} else {
// Invalid definition
return error ( ' I n v a l i d e l e m e n t d e f i n i t i o n : $ { definition_string } ' )
}
// TODO: Add support for SYSTEM and PUBLIC entities
return DTDElement { name , definition } , contents [ element_end + 1 .. ]
}
fn parse_doctype ( mut reader io . Reader ) ! DocumentType {
// We may have more < in the doctype so keep count
mut depth := 1
2024-09-10 16:25:56 +08:00
mut doctype_buffer := strings . new_builder ( default_string_builder_cap )
2023-11-06 13:14:30 +00:00
mut local_buf := [ u8 ( 0 ) ]
for {
ch := next_char ( mut reader , mut local_buf ) !
doctype_buffer . write_u8 ( ch )
match ch {
` < ` {
depth ++
}
` > ` {
depth --
if depth == 0 {
break
}
}
else { }
}
}
doctype_contents := doctype_buffer . str ( ) . trim_space ( )
name := doctype_contents . all_before ( ' [ ' ) . trim_space ( )
mut list_contents := doctype_contents . all_after ( ' [ ' ) . all_before ( ' ] ' ) . trim_space ( )
mut items := [ ] DTDListItem { }
for list_contents . len > 0 {
if list_contents . starts_with ( ' < ! E N T I T Y ' ) {
entity , remaining := parse_entity ( list_contents ) !
items << entity
list_contents = remaining . trim_space ( )
} else if list_contents . starts_with ( ' < ! E L E M E N T ' ) {
element , remaining := parse_element ( list_contents ) !
items << element
list_contents = remaining . trim_space ( )
} else {
return error ( ' U n k n o w n D O C T Y P E l i s t i t e m : $ { list_contents } ' )
}
}
return DocumentType {
name : name
2024-08-11 14:11:24 +08:00
dtd : DocumentTypeDefinition {
2023-11-06 13:14:30 +00:00
list : items
}
}
}
fn parse_prolog ( mut reader io . Reader ) ! ( Prolog , u8 ) {
2023-11-13 12:24:39 +00:00
// Skip trailing whitespace and invalid characters
2023-11-06 13:14:30 +00:00
mut local_buf := [ u8 ( 0 ) ]
mut ch := next_char ( mut reader , mut local_buf ) !
for {
match ch {
2023-11-13 12:24:39 +00:00
` ` , ` \t ` , ` \r ` , ` \n ` {
2023-11-06 13:14:30 +00:00
ch = next_char ( mut reader , mut local_buf ) !
continue
}
` < ` {
break
}
2024-09-10 16:25:56 +08:00
byte_order_marking_first {
2023-11-13 12:24:39 +00:00
// UTF-8 BOM
mut bom_buf := [ u8 ( 0 ) , 0 ]
if reader . read ( mut bom_buf ) ! != 2 {
return error ( ' I n v a l i d U T F - 8 B O M . ' )
}
2024-09-10 16:25:56 +08:00
if bom_buf != byte_order_marking_bytes {
2023-11-13 12:24:39 +00:00
return error ( ' I n v a l i d U T F - 8 B O M . ' )
}
ch = next_char ( mut reader , mut local_buf ) !
continue
}
2023-11-06 13:14:30 +00:00
else {
return error ( ' E x p e c t i n g a p r o l o g o r r o o t n o d e s t a r t i n g w i t h " < " . ' )
}
}
}
ch = next_char ( mut reader , mut local_buf ) !
if ch != ` ? ` {
return Prolog { } , ch
}
ch = next_char ( mut reader , mut local_buf ) !
if ch != ` x ` {
return error ( ' E x p e c t i n g a p r o l o g s t a r t i n g w i t h " < ? x " . ' )
}
ch = next_char ( mut reader , mut local_buf ) !
if ch != ` m ` {
return error ( ' E x p e c t i n g a p r o l o g s t a r t i n g w i t h " < ? x m " . ' )
}
ch = next_char ( mut reader , mut local_buf ) !
if ch != ` l ` {
return error ( ' E x p e c t i n g a p r o l o g s t a r t i n g w i t h " < ? x m l " . ' )
}
2024-09-10 16:25:56 +08:00
mut prolog_buffer := strings . new_builder ( default_string_builder_cap )
2023-11-06 13:14:30 +00:00
// Keep reading character by character until we find the end of the prolog
mut found_question_mark := false
for {
ch = next_char ( mut reader , mut local_buf ) !
match ch {
` ? ` {
if found_question_mark {
return error ( ' I n v a l i d p r o l o g : T w o q u e s t i o n m a r k s f o u n d i n a r o w . ' )
}
found_question_mark = true
}
` > ` {
if found_question_mark {
break
}
return error ( ' I n v a l i d p r o l o g : F o u n d " > " b e f o r e " ? " . ' )
}
else {
if found_question_mark {
found_question_mark = false
prolog_buffer . write_u8 ( ` ? ` )
}
prolog_buffer . write_u8 ( ch )
}
}
}
prolog_attributes := prolog_buffer . str ( ) . trim_space ( )
attributes := if prolog_attributes . len == 0 {
2024-09-10 16:25:56 +08:00
default_prolog_attributes
2023-11-06 13:14:30 +00:00
} else {
parse_attributes ( prolog_attributes ) !
}
version := attributes [ ' v e r s i o n ' ] or { return error ( ' X M L d e c l a r a t i o n m i s s i n g v e r s i o n . ' ) }
encoding := attributes [ ' e n c o d i n g ' ] or { ' U T F - 8 ' }
mut comments := [ ] XMLComment { }
mut doctype := DocumentType {
name : ' '
2024-08-11 14:11:24 +08:00
dtd : ' '
2023-11-06 13:14:30 +00:00
}
mut found_doctype := false
for {
ch = next_char ( mut reader , mut local_buf ) !
match ch {
` ` , ` \t ` , ` \n ` {
continue
}
` < ` {
// We have a comment, DOCTYPE, or root node
ch = next_char ( mut reader , mut local_buf ) !
match ch {
` ! ` {
// A comment or DOCTYPE
match next_char ( mut reader , mut local_buf ) ! {
` - ` {
// A comment
if next_char ( mut reader , mut local_buf ) ! != ` - ` {
return error ( ' I n v a l i d c o m m e n t . ' )
}
comments << parse_comment ( mut reader ) !
}
` D ` {
if found_doctype {
return error ( ' D u p l i c a t e D O C T Y P E d e c l a r a t i o n . ' )
}
// <!D -> OCTYPE
mut doc_buf := [ ] u8 { len : 6 }
if reader . read ( mut doc_buf ) ! != 6 {
return error ( ' I n v a l i d D O C T Y P E . ' )
}
2024-09-10 16:25:56 +08:00
if doc_buf != doctype_chars {
2023-11-06 13:14:30 +00:00
return error ( ' I n v a l i d D O C T Y P E . ' )
}
found_doctype = true
doctype = parse_doctype ( mut reader ) !
}
else {
return error ( ' U n s u p p o r t e d c o n t r o l s e q u e n c e f o u n d i n p r o l o g . ' )
}
}
}
else {
// We have found the start of the root node
break
}
}
}
else { }
}
}
return Prolog {
2024-08-11 14:11:24 +08:00
version : version
2023-11-06 13:14:30 +00:00
encoding : encoding
2024-08-11 14:11:24 +08:00
doctype : doctype
2023-11-06 13:14:30 +00:00
comments : comments
} , ch
}
fn parse_children ( name string , attributes map [ string ] string , mut reader io . Reader ) ! XMLNode {
2024-09-10 16:25:56 +08:00
mut inner_contents := strings . new_builder ( default_string_builder_cap )
2023-11-06 13:14:30 +00:00
mut children := [ ] XMLNodeContents { }
mut local_buf := [ u8 ( 0 ) ]
for {
ch := next_char ( mut reader , mut local_buf ) !
match ch {
` < ` {
second_char := next_char ( mut reader , mut local_buf ) !
match second_char {
` ! ` {
// Comment, CDATA
mut next_two := [ u8 ( 0 ) , 0 ]
if reader . read ( mut next_two ) ! != 2 {
return error ( ' I n v a l i d X M L . I n c o m p l e t e c o m m e n t o r C D A T A d e c l a r a t i o n . ' )
}
2024-09-10 16:25:56 +08:00
if next_two == double_dash {
2023-11-06 13:14:30 +00:00
// Comment
comment := parse_comment ( mut reader ) !
children << comment
2024-09-10 16:25:56 +08:00
} else if next_two == c_tag {
2023-11-06 13:14:30 +00:00
// <![CDATA -> DATA
mut cdata_buf := [ ] u8 { len : 4 }
if reader . read ( mut cdata_buf ) ! != 4 {
return error ( ' I n v a l i d X M L . I n c o m p l e t e C D A T A d e c l a r a t i o n . ' )
}
2024-09-10 16:25:56 +08:00
if cdata_buf != data_chars {
2023-11-06 13:14:30 +00:00
return error ( ' I n v a l i d X M L . E x p e c t e d " C D A T A " a f t e r " < ! [ C " . ' )
}
cdata := parse_cdata ( mut reader ) !
children << cdata
} else {
return error ( ' I n v a l i d X M L . U n k n o w n c o n t r o l s e q u e n c e : $ { next_two . bytestr ( ) } ' )
}
}
` / ` {
// End of node
mut node_end_buffer := [ ] u8 { len : name . len + 1 }
if reader . read ( mut node_end_buffer ) ! != name . len + 1 {
return error ( ' I n v a l i d X M L . I n c o m p l e t e n o d e e n d . ' )
}
mut ending_chars := name . bytes ( )
ending_chars << ` > `
if node_end_buffer != ending_chars {
return error ( ' X M L n o d e < $ { name } > n o t c l o s e d . ' )
}
collected_contents := inner_contents . str ( ) . trim_space ( )
if collected_contents . len > 0 {
// We have some inner text
children << collected_contents . replace ( ' \r \n ' , ' \n ' )
}
return XMLNode {
2024-08-11 14:11:24 +08:00
name : name
2023-11-06 13:14:30 +00:00
attributes : attributes
2024-08-11 14:11:24 +08:00
children : children
2023-11-06 13:14:30 +00:00
}
}
else {
// Start of child node
child := parse_single_node ( second_char , mut reader ) or {
if err . msg ( ) == ' X M L n o d e c a n n o t s t a r t w i t h " < / " . ' {
return error ( ' X M L n o d e < $ { name } > n o t c l o s e d . ' )
} else {
return err
}
}
text := inner_contents . str ( ) . trim_space ( )
if text . len > 0 {
children << text . replace ( ' \r \n ' , ' \n ' )
}
children << child
}
}
}
else {
inner_contents . write_u8 ( ch )
}
}
}
return error ( ' X M L n o d e < $ { name } > n o t c l o s e d . ' )
}
2023-11-16 18:13:36 +00:00
// parse_single_node parses a single XML node from the reader. The first character of the tag is passed
// in as the first_char parameter.
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large
// XML documents without running out of memory.
pub fn parse_single_node ( first_char u8 , mut reader io . Reader ) ! XMLNode {
2024-09-10 16:25:56 +08:00
mut contents := strings . new_builder ( default_string_builder_cap )
2023-11-10 09:31:36 +00:00
contents . write_u8 ( first_char )
2023-11-06 13:14:30 +00:00
2023-11-10 09:31:36 +00:00
mut local_buf := [ u8 ( 0 ) ]
for {
mut ch := next_char ( mut reader , mut local_buf ) !
2023-11-06 13:14:30 +00:00
if ch == ` > ` {
break
}
contents . write_u8 ( ch )
}
tag_contents := contents . str ( ) . trim_space ( )
2025-09-22 21:14:52 +02:00
parts := tag_contents . split_any ( ' \t \r \n ' )
2023-11-17 08:51:46 +00:00
name := parts [ 0 ] . trim_right ( ' / ' )
2023-11-06 13:14:30 +00:00
// Check if it is a self-closing tag
if tag_contents . ends_with ( ' / ' ) {
// We're not looking for children and inner text
return XMLNode {
2024-08-11 14:11:24 +08:00
name : name
2023-11-16 18:13:36 +00:00
attributes : parse_attributes ( tag_contents [ name . len .. tag_contents . len - 1 ] . trim_space ( ) ) !
2023-11-06 13:14:30 +00:00
}
}
2023-11-10 09:31:36 +00:00
attribute_string := tag_contents [ name . len .. ] . trim_space ( )
2023-11-06 13:14:30 +00:00
attributes := parse_attributes ( attribute_string ) !
return parse_children ( name , attributes , mut reader )
}
// XMLDocument .from_string parses an XML document from a string.
pub fn XMLDocument . from_string ( raw_contents string ) ! XMLDocument {
mut reader := FullBufferReader {
contents : raw_contents . bytes ( )
}
return XMLDocument . from_reader ( mut reader ) !
}
// XMLDocument .from_file parses an XML document from a file. Note that the file is read in its entirety
// and then parsed. If the file is too large, try using the XMLDocument.from_reader function instead.
pub fn XMLDocument . from_file ( path string ) ! XMLDocument {
mut reader := FullBufferReader {
contents : os . read_bytes ( path ) !
}
return XMLDocument . from_reader ( mut reader ) !
}
// XMLDocument .from_reader parses an XML document from a reader. This is the most generic way to parse
// an XML document from any arbitrary source that implements that io.Reader interface.
pub fn XMLDocument . from_reader ( mut reader io . Reader ) ! XMLDocument {
prolog , first_char := parse_prolog ( mut reader ) or {
if err is os . Eof || err is io . Eof || err . msg ( ) == ' U n e x p e c t e d E n d O f F i l e . ' {
return error ( ' X M L d o c u m e n t i s e m p t y . ' )
} else {
return err
}
}
root := parse_single_node ( first_char , mut reader ) !
return XMLDocument {
2024-08-11 14:11:24 +08:00
version : prolog . version
2023-11-06 13:14:30 +00:00
encoding : prolog . encoding
comments : prolog . comments
2024-08-11 14:11:24 +08:00
doctype : prolog . doctype
root : root
2023-11-06 13:14:30 +00:00
}
}