-module(ys_parse_simple).
-export([
parse_content/2,
parse_Misc/2,
parse_element/2,
parse_XMLDecl/2
]).
-export([
event_endDocument/1,
event_endElement/2
]).
-include("yaccety_sax_simple.hrl").
%% Append a chunk to an accumulator (delegates to append/2 below).
-define(APPEND(Thing, Acc), append(Thing, Acc)).
%% Slice Len bytes of Stream starting at offset Pos and append the slice to Acc.
-define(ACC(Stream, Pos, Len, Acc), ?APPEND(binary_part(Stream, Pos, Len), Acc)).
%% Four-variable argument bundles threaded through the parse functions.
%% The numbered variants give distinct names to the values produced by
%% successive parsing steps inside a single function body.
-define(MATCH, Bytes, Stream, Pos, State).
-define(MATCH1, Bytes1, Stream1, Pos1, State1).
-define(MATCH2, Bytes2, Stream2, Pos2, State2).
-define(MATCH3, Bytes3, Stream3, Pos3, State3).
-define(MATCH4, Bytes4, Stream4, Pos4, State4).
-define(MATCH5, Bytes5, Stream5, Pos5, State5).
-define(MATCH6, Bytes6, Stream6, Pos6, State6).
-define(MATCH7, Bytes7, Stream7, Pos7, State7).
%% Guard-safe test for the four code points of the XML S production:
%% space (#x20), tab (#x9), line feed (#xA) and carriage return (#xD).
-define(WS(Char), (Char == 16#20 orelse Char == 16#9 orelse Char == 16#A orelse Char == 16#D)).
%% Request compiler inlining of these small helpers.
-compile({inline, [append/2, set_state_pos/2, to_binary/1, fatal_error/2]}).
%% Add a chunk to an iodata accumulator.
%% An empty binary leaves the accumulator untouched, the first real chunk
%% becomes the accumulator itself, and later chunks are nested as [Acc, Chunk].
append(Chunk, Acc) ->
    case {Chunk, Acc} of
        {<<>>, _} -> Acc;
        {_, []} -> Chunk;
        _ -> [Acc, Chunk]
    end.
%%----------------------------------------------------------------------
%% XML character range
%% [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
%% [#x10000-#x10FFFF]
%% any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
%%----------------------------------------------------------------------
%% Shared tail clauses for the scanning functions (comment, character data,
%% CDATA and attribute-value scanners). The macro expands to clauses of the
%% enclosing function (via ?FUNCTION_NAME) that accept one legal XML Char and
%% extend the current slice length by its UTF-8 byte width:
%%   * tab, line feed and carriage return are accepted (1 byte each);
%%   * any other control character below #x20 is a fatal error;
%%   * #xFFFE and #xFFFF are fatal errors;
%%   * everything else advances by 1-4 bytes depending on the code point.
%% The final clause is reached when no character can be decoded (end of the
%% available bytes or malformed UTF-8); it flushes the pending slice and
%% returns {Binary, RestBytes, Stream, NewPos, State}.
%%
%% Carriage return (#xD) is a legal Char per production [2] above and is
%% already accepted by ?WS and maybe_consume_s, so it is accepted here as
%% well instead of being reported as an invalid character. It is passed
%% through unchanged (no line-ending normalisation is performed).
-define(ONECHAR,
    ?FUNCTION_NAME(<<16#9, Rest/bitstring>>, Stream, Pos, Len, State, Acc) ->
        ?FUNCTION_NAME(Rest, Stream, Pos, Len + 1, State, Acc);
    ?FUNCTION_NAME(<<16#A, Rest/bitstring>>, Stream, Pos, Len, State, Acc) ->
        ?FUNCTION_NAME(Rest, Stream, Pos, Len + 1, State, Acc);
    ?FUNCTION_NAME(<<16#D, Rest/bitstring>>, Stream, Pos, Len, State, Acc) ->
        ?FUNCTION_NAME(Rest, Stream, Pos, Len + 1, State, Acc);
    ?FUNCTION_NAME(<<Char, _/bitstring>>, _, _, _, State, _) when Char < 16#20 ->
        fatal_error({invalid_character, {?LINE, [Char]}}, State);
    ?FUNCTION_NAME(<<Char, Rest/bitstring>>, Stream, Pos, Len, State, Acc) when Char < 16#80 ->
        ?FUNCTION_NAME(Rest, Stream, Pos, Len + 1, State, Acc);
    ?FUNCTION_NAME(<<Char/utf8, Rest/bitstring>>, Stream, Pos, Len, State, Acc) ->
        if
            Char < 16#800 ->
                ?FUNCTION_NAME(Rest, Stream, Pos, Len + 2, State, Acc);
            Char == 16#FFFE; Char == 16#FFFF ->
                fatal_error({invalid_character, {?LINE, [Char]}}, State);
            Char < 16#10000 ->
                ?FUNCTION_NAME(Rest, Stream, Pos, Len + 3, State, Acc);
            true ->
                ?FUNCTION_NAME(Rest, Stream, Pos, Len + 4, State, Acc)
        end;
    ?FUNCTION_NAME(Bytes, Stream, Pos, Len, State, Acc) ->
        Acc1 = ?ACC(Stream, Pos, Len, Acc),
        {to_binary(Acc1), Bytes, Stream, Pos + Len, State}
).
%%----------------------------------------------------------------------
%% Consume whitespace characters
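%% params: Bytes (unparsed input), Stream, Pos, State
%% returns: {Found, RestBytes, Stream, NewPos, State} where Found is true
%% when at least one whitespace character was skipped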
%% [3] S ::= (#x20 | #x9 | #xD | #xA)+
%%----------------------------------------------------------------------
%% Skip a (possibly empty) run of XML whitespace at the head of Bytes.
%% Returns {Found, RestBytes, Stream, NewPos, State}; Found tells the caller
%% whether any whitespace was actually skipped, and NewPos is Pos advanced
%% by one for every skipped byte.
maybe_consume_s(Bytes, Stream, Pos, State) ->
    maybe_consume_s(Bytes, Stream, Pos, State, false).

%% Worker: the fifth argument records whether whitespace has been seen so far.
maybe_consume_s(<<C, Tail/bitstring>>, Stream, Pos, State, _Seen) when
    C =:= $\s; C =:= $\t; C =:= $\r; C =:= $\n
->
    maybe_consume_s(Tail, Stream, Pos + 1, State, true);
maybe_consume_s(Bytes, Stream, Pos, State, Seen) ->
    {Seen, Bytes, Stream, Pos, State}.
%%----------------------------------------------------------------------
%% Parse Name (does not split prefix and local parts like NCName)
%% params: Bytes (unparsed input), Stream, Pos, State
%% returns: {Name, RestBytes, Stream, NewPos, State}
%% [4] NameStartChar
%% ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D]
%% | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
%% | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
%% [5] Name ::= NameStartChar (NameChar)*
%%----------------------------------------------------------------------
%% Guard-safe test for the single-byte (ASCII) subset of NameStartChar:
%% ':', 'A'-'Z', '_' and 'a'-'z'.
-define(NAMECHAR1(Char),
(Char == $: orelse (Char >= $A andalso Char =< $Z) orelse Char == $_ orelse
(Char >= $a andalso Char =< $z))
).
%% Guard-safe test for the single-byte (ASCII) subset of NameChar: everything
%% accepted by ?NAMECHAR1 plus '-', '.' and the digits '0'-'9'.
-define(NAMECHAR(Char),
(Char == $- orelse
Char == $. orelse
(Char >= $0 andalso Char =< $9) orelse
Char == $: orelse (Char >= $A andalso Char =< $Z) orelse Char == $_ orelse
(Char >= $a andalso Char =< $z))
).
%% Entry point for parsing a Name. Validates the first character as a
%% NameStartChar and hands the rest to parse_Name/6 with the number of bytes
%% consumed so far. The first clauses are ASCII fast paths that accept up to
%% four name bytes at once; the UTF-8 clause handles multi-byte start
%% characters. Anything else is a fatal bad_name error.
parse_Name(<<C1, C2, C3, C4, Tail/bitstring>>, Stream, Pos, State) when
    ?NAMECHAR1(C1), ?NAMECHAR(C2), ?NAMECHAR(C3), ?NAMECHAR(C4)
->
    parse_Name(Tail, Stream, Pos, 4, State, []);
parse_Name(<<C1, C2, C3, Tail/bitstring>>, Stream, Pos, State) when
    ?NAMECHAR1(C1), ?NAMECHAR(C2), ?NAMECHAR(C3)
->
    parse_Name(Tail, Stream, Pos, 3, State, []);
parse_Name(<<C1, C2, Tail/bitstring>>, Stream, Pos, State) when
    ?NAMECHAR1(C1), ?NAMECHAR(C2)
->
    parse_Name(Tail, Stream, Pos, 2, State, []);
parse_Name(<<C1, Tail/bitstring>>, Stream, Pos, State) when ?NAMECHAR1(C1) ->
    parse_Name(Tail, Stream, Pos, 1, State, []);
parse_Name(<<CodePoint/utf8, Tail/bitstring>>, Stream, Pos, State) ->
    case name_start_width(CodePoint) of
        none -> fatal_error(bad_name, {[CodePoint], State});
        Width -> parse_Name(Tail, Stream, Pos, Width, State, [])
    end;
parse_Name(Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_name, {Bytes, State}).

%% UTF-8 byte width of a non-ASCII NameStartChar, or 'none' when the code
%% point may not start a name.
name_start_width(C) when
    C >= 16#C0, C =< 16#D6;
    C >= 16#D8, C =< 16#F6;
    C >= 16#F8, C =< 16#2FF;
    C >= 16#370, C =< 16#37D;
    C >= 16#37F, C =< 16#7FF
->
    2;
name_start_width(C) when
    C >= 16#800, C =< 16#1FFF;
    C >= 16#200C, C =< 16#200D;
    C >= 16#2070, C =< 16#218F;
    C >= 16#2C00, C =< 16#2FEF;
    C >= 16#3001, C =< 16#D7FF;
    C >= 16#F900, C =< 16#FDCF;
    C >= 16#FDF0, C =< 16#FFFD
->
    3;
name_start_width(C) when C >= 16#10000, C =< 16#EFFFF ->
    4;
name_start_width(_) ->
    none.
%% Continue scanning a Name after its first character. Len counts the bytes
%% of the name seen so far, starting at offset Pos of Stream. Scanning stops
%% at the first byte or code point that is not a NameChar; the result is
%% {Name, RestBytes, Stream, Pos + Len, State}.
parse_Name(<<C1, C2, C3, C4, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?NAMECHAR(C1), ?NAMECHAR(C2), ?NAMECHAR(C3), ?NAMECHAR(C4)
->
    parse_Name(Tail, Stream, Pos, Len + 4, State, Acc);
parse_Name(<<C1, C2, C3, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?NAMECHAR(C1), ?NAMECHAR(C2), ?NAMECHAR(C3)
->
    parse_Name(Tail, Stream, Pos, Len + 3, State, Acc);
parse_Name(<<C1, C2, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?NAMECHAR(C1), ?NAMECHAR(C2)
->
    parse_Name(Tail, Stream, Pos, Len + 2, State, Acc);
parse_Name(<<C1, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when ?NAMECHAR(C1) ->
    parse_Name(Tail, Stream, Pos, Len + 1, State, Acc);
%% Any other ASCII byte terminates the name.
parse_Name(<<C1, _/bitstring>> = Bytes, Stream, Pos, Len, State, Acc) when C1 < 16#80 ->
    {to_binary(?ACC(Stream, Pos, Len, Acc)), Bytes, Stream, Pos + Len, State};
%% Multi-byte NameChar: advance by its UTF-8 width, otherwise the name ends here.
parse_Name(<<CodePoint/utf8, Tail/bitstring>> = Bytes, Stream, Pos, Len, State, Acc) ->
    if
        CodePoint == 16#B7;
        CodePoint >= 16#C0, CodePoint =< 16#D6;
        CodePoint >= 16#D8, CodePoint =< 16#F6;
        CodePoint >= 16#F8, CodePoint =< 16#37D;
        CodePoint >= 16#37F, CodePoint =< 16#7FF ->
            parse_Name(Tail, Stream, Pos, Len + 2, State, Acc);
        CodePoint >= 16#800, CodePoint =< 16#1FFF;
        CodePoint >= 16#200C, CodePoint =< 16#200D;
        CodePoint >= 16#203F, CodePoint =< 16#2040;
        CodePoint >= 16#2070, CodePoint =< 16#218F;
        CodePoint >= 16#2C00, CodePoint =< 16#2FEF;
        CodePoint >= 16#3001, CodePoint =< 16#D7FF;
        CodePoint >= 16#F900, CodePoint =< 16#FDCF;
        CodePoint >= 16#FDF0, CodePoint =< 16#FFFD ->
            parse_Name(Tail, Stream, Pos, Len + 3, State, Acc);
        CodePoint >= 16#10000, CodePoint =< 16#EFFFF ->
            parse_Name(Tail, Stream, Pos, Len + 4, State, Acc);
        true ->
            {to_binary(?ACC(Stream, Pos, Len, Acc)), Bytes, Stream, Pos + Len, State}
    end;
%% Nothing decodable is left (empty input or malformed UTF-8): the name ends here.
parse_Name(Bytes, Stream, Pos, Len, State, Acc) ->
    {to_binary(?ACC(Stream, Pos, Len, Acc)), Bytes, Stream, Pos + Len, State}.
%%----------------------------------------------------------------------
%% [67] Reference ::= EntityRef | CharRef
%% & char is removed
%% [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
%% [WFC: Legal Character]
%% [68] EntityRef ::= '&' Name ';'
%% [WFC: Entity Declared]
%% [WFC: Parsed Entity]
%% [WFC: No Recursion]
%% can be EntityRef | CharRef
%%----------------------------------------------------------------------
%% Dispatch on the kind of reference; the leading '&' is already consumed.
%% "#x" starts a hexadecimal character reference, "#" a decimal one, and
%% anything else is treated as a named entity reference. Type is only
%% forwarded to the entity resolver.
parse_Reference(<<"#x", Digits/bitstring>>, Stream, Pos, State, _Type) ->
    parse_Reference_hex(Digits, Stream, Pos + 2, 0, State, []);
parse_Reference(<<"#", Digits/bitstring>>, Stream, Pos, State, _Type) ->
    parse_Reference_dec(Digits, Stream, Pos + 1, 0, State, []);
parse_Reference(Bytes, Stream, Pos, State, Type) ->
    parse_Reference_name(Bytes, Stream, Pos, State, Type).
% hex parse until ';' return char
%% Scan the hexadecimal digits of a character reference up to ';'.
%% On ';' the digits are converted to a code point, which must be a legal
%% XML Char; the result is {{hex, Utf8Binary}, RestBytes, Stream, NewPos, State}.
%% An empty digit run, an out-of-range code point or any non-hex byte is a
%% fatal bad_charref error.
parse_Reference_hex(<<$;, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Digits = ?ACC(Stream, Pos, Len, Acc),
    try binary_to_integer(to_binary(Digits), 16) of
        Code when
            Code == 16#9;
            Code == 16#A;
            Code == 16#D;
            Code >= 16#20, Code =< 16#D7FF;
            Code >= 16#E000, Code =< 16#FFFD;
            Code >= 16#10000, Code =< 16#10FFFF
        ->
            {{hex, <<Code/utf8>>}, Tail, Stream, Pos + Len + 1, State};
        _OutOfRange ->
            fatal_error(bad_charref, State)
    catch
        _:_ ->
            fatal_error(bad_charref, State)
    end;
parse_Reference_hex(<<Digit, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    Digit >= $0, Digit =< $9; Digit >= $a, Digit =< $f; Digit >= $A, Digit =< $F
->
    parse_Reference_hex(Tail, Stream, Pos, Len + 1, State, Acc);
parse_Reference_hex(_Bytes, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_charref, State).
% decimal parse until ';' return char
%% Scan the decimal digits of a character reference up to ';'.
%% On ';' the digits are converted to a code point, which must be a legal
%% XML Char; the result is {{dec, Utf8Binary}, RestBytes, Stream, NewPos, State}.
%% An empty digit run, an out-of-range code point or any non-digit byte is a
%% fatal bad_charref error.
parse_Reference_dec(<<$;, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Digits = ?ACC(Stream, Pos, Len, Acc),
    try binary_to_integer(to_binary(Digits)) of
        Code when
            Code == 16#9;
            Code == 16#A;
            Code == 16#D;
            Code >= 16#20, Code =< 16#D7FF;
            Code >= 16#E000, Code =< 16#FFFD;
            Code >= 16#10000, Code =< 16#10FFFF
        ->
            {{dec, <<Code/utf8>>}, Tail, Stream, Pos + Len + 1, State};
        _OutOfRange ->
            fatal_error(bad_charref, {Digits, State})
    catch
        _:_ ->
            fatal_error(bad_charref, {Digits, State})
    end;
parse_Reference_dec(<<Digit, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    Digit >= $0, Digit =< $9
->
    parse_Reference_dec(Tail, Stream, Pos, Len + 1, State, Acc);
parse_Reference_dec(_Bytes, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_charref, State).
% Name parse until ';' return Name
%% Parse a named entity reference: a Name that must be followed directly by
%% ';'. The name is resolved through resolve_general_entity/3 and returned
%% as {Resolved, RestBytes, Stream, NewPos, State}. A missing ';' is a fatal
%% bad_charref error.
parse_Reference_name(Bytes, Stream, Pos, State, Type) ->
    case parse_Name(Bytes, Stream, Pos, State) of
        {Name, <<$;, Tail/bitstring>>, Stream1, Pos1, State1} ->
            Resolved = resolve_general_entity(Name, State1, Type),
            {Resolved, Tail, Stream1, Pos1 + 1, State1};
        {_Name, _Other, _Stream1, _Pos1, State1} ->
            fatal_error(bad_charref, State1)
    end.
%%----------------------------------------------------------------------
%% Parse a Comment, leading '<!--' already removed
%% params: Bytes (input after '<!--'), Stream, Pos, State
%% returns: a state record; the comment text itself is discarded
%% [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
%%----------------------------------------------------------------------
%% Parse a comment whose leading '<!--' has already been consumed and drop
%% its text. Returns the state with rest_stream set to the bytes that follow
%% the closing '-->', so the next parsing step resumes after the comment.
%% (Previously the remaining bytes were discarded and the state was returned
%% untouched, leaving rest_stream pointing at the already-parsed input.)
parse_Comment(Bytes, Stream, Pos, State) ->
    {_Text, Bytes1, _Stream1, _Pos1, State1} =
        parse_Comment(Bytes, Stream, Pos, 0, State, []),
    set_state_pos(State1, Bytes1).
%% Scan comment text. '-->' ends the comment and yields
%% {Text, RestBytes, Stream, NewPos, State}; a '--' that is not part of the
%% terminator is a fatal bad_comment error. Every other character is
%% validated and counted by the shared ?ONECHAR clauses.
parse_Comment(<<"-->", Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    {?ACC(Stream, Pos, Len, Acc), Tail, Stream, Pos + Len + 3, State};
parse_Comment(<<"--", _/bitstring>>, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_comment, State);
?ONECHAR.
%%----------------------------------------------------------------------
%% Parse character data. In content, everything that is not
%% (element | Reference | CDSect | PI | Comment)
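%% params: Bytes, Stream, Pos, State (plus an initial accumulator for /5)
%% returns: {{characters, Text}, NewState}, or just NewState when the text
%% is empty or (for /4) consists only of whitespace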
%% [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
%%----------------------------------------------------------------------
%% Parse a run of character data starting at Bytes. A run consisting only of
%% whitespace is dropped and just the advanced state is returned; otherwise a
%% characters event is produced.
parse_CharData(Bytes, Stream, Pos, State) ->
    {{IsWs, Text}, Tail, _, _, State1} =
        parse_CharData_ws(Bytes, Stream, Pos, 0, State, []),
    NewState = set_state_pos(State1, Tail),
    if
        IsWs =:= true -> NewState;
        true -> event_characters(to_binary(Text), NewState)
    end.

%% Parse character data that continues after already-collected text (Acc),
%% e.g. a resolved reference. Always goes through event_characters/2.
parse_CharData(Bytes, Stream, Pos, State, Acc) ->
    {{_IsWs, Text}, Tail, _, _, State1} =
        parse_CharData(Bytes, Stream, Pos, 0, State, Acc),
    Data = to_binary(Text),
    event_characters(Data, set_state_pos(State1, Tail)).
%% Scan character data up to the next '<'. References introduced by '&' are
%% resolved and spliced into the accumulator; a literal ']]>' is a fatal
%% bad_char_data error. All other characters are handled by ?ONECHAR.
%% On '<' the result is {{false, Text}, Bytes, Stream, NewPos, State}.
parse_CharData(<<$<, _/bitstring>> = Bytes, Stream, Pos, Len, State, Acc) ->
    {{false, ?ACC(Stream, Pos, Len, Acc)}, Bytes, Stream, Pos + Len, State};
parse_CharData(<<$&, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Before = ?ACC(Stream, Pos, Len, Acc),
    {{_Kind, Ref}, Bytes1, Stream1, Pos1, State1} =
        parse_Reference(Tail, Stream, Pos + Len + 1, State, text),
    parse_CharData(Bytes1, Stream1, Pos1, 0, State1, ?APPEND(Ref, Before));
parse_CharData(<<"]]>", _/bitstring>>, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_char_data, State);
?ONECHAR.
%% Scan character data while it is still whitespace-only. Reaching '<' in
%% this mode yields {{true, Text}, ...} so the caller can drop ignorable
%% whitespace. The first reference or non-whitespace character switches to
%% parse_CharData/6, which reports the run as non-whitespace.
parse_CharData_ws(<<C1, C2, C3, C4, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?WS(C1), ?WS(C2), ?WS(C3), ?WS(C4)
->
    parse_CharData_ws(Tail, Stream, Pos, Len + 4, State, Acc);
parse_CharData_ws(<<C1, C2, C3, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?WS(C1), ?WS(C2), ?WS(C3)
->
    parse_CharData_ws(Tail, Stream, Pos, Len + 3, State, Acc);
parse_CharData_ws(<<C1, C2, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    ?WS(C1), ?WS(C2)
->
    parse_CharData_ws(Tail, Stream, Pos, Len + 2, State, Acc);
parse_CharData_ws(<<C1, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when ?WS(C1) ->
    parse_CharData_ws(Tail, Stream, Pos, Len + 1, State, Acc);
parse_CharData_ws(<<$<, _/bitstring>> = Bytes, Stream, Pos, Len, State, Acc) ->
    {{true, ?ACC(Stream, Pos, Len, Acc)}, Bytes, Stream, Pos + Len, State};
parse_CharData_ws(<<$&, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Before = ?ACC(Stream, Pos, Len, Acc),
    {{_Kind, Ref}, Bytes1, Stream1, Pos1, State1} =
        parse_Reference(Tail, Stream, Pos + Len + 1, State, text),
    parse_CharData(Bytes1, Stream1, Pos1, 0, State1, ?APPEND(Ref, Before));
parse_CharData_ws(Bytes, Stream, Pos, Len, State, Acc) ->
    parse_CharData(Bytes, Stream, Pos, Len, State, Acc).
%%----------------------------------------------------------------------
%% [18] CDSect ::= CDStart CData CDEnd
%% [19] CDStart ::= '<![CDATA['
%% [20] CData ::= (Char* - (Char* ']]>' Char*))
%% [21] CDEnd ::= ']]>'
%% Parse CDATA Section. '<![' is already matched.
%% params: Bytes (input after '<!['), Stream, Pos, State
%% returns: {{characters, Text}, NewState}, or just NewState when the
%% section is empty
%%----------------------------------------------------------------------
%% Parse a CDATA section whose leading '<![' is already consumed. The input
%% must continue with 'CDATA['; anything else is a fatal bad_cdata error.
parse_CDSect(<<"CDATA[", Tail/bitstring>>, Stream, Pos, State) ->
    parse_CData(Tail, Stream, Pos + 6, State);
parse_CDSect(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_cdata, State).
%% Collect the text of a CDATA section and emit it as a characters event
%% (an empty section produces no event, see event_characters/2).
parse_CData(Bytes, Stream, Pos, State) ->
    {Text, Tail, _, _, State1} = parse_CData(Bytes, Stream, Pos, 0, State, []),
    Data = to_binary(Text),
    event_characters(Data, set_state_pos(State1, Tail)).

%% Scan CDATA text up to the terminating ']]>'; every other character is
%% validated and counted by the shared ?ONECHAR clauses.
parse_CData(<<"]]>", Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    {?ACC(Stream, Pos, Len, Acc), Tail, Stream, Pos + Len + 3, State};
?ONECHAR.
%%----------------------------------------------------------------------
%% [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
%%----------------------------------------------------------------------
%% Parse an optional encoding declaration (leading whitespace already
%% consumed by the caller). Returns {{EncodingName, WasDeclared}, RestBytes,
%% Stream, NewPos, State}; when no 'encoding' keyword is present the name
%% defaults to <<"UTF-8">> with WasDeclared = false.
parse_EncodingDecl(Bytes, Stream, Pos, State) ->
    parse_EncodingDeclS(Bytes, Stream, Pos, State).

parse_EncodingDeclS(<<"encoding", Tail/bitstring>>, Stream, Pos, State) ->
    parse_EncodingDecl_encoding(Tail, Stream, Pos + 8, State);
parse_EncodingDeclS(Bytes, Stream, Pos, State) ->
    % no encoding declared
    {{<<"UTF-8">>, false}, Bytes, Stream, Pos, State}.
%% After the 'encoding' keyword: consume the '=' (with optional surrounding
%% whitespace), then the quoted encoding name.
parse_EncodingDecl_encoding(Bytes, Stream, Pos, State) ->
    {Bytes1, Stream1, Pos1, State1} = parse_Eq(Bytes, Stream, Pos, State),
    parse_EncodingDecl_EncName(Bytes1, Stream1, Pos1, State1).

%% Dispatch on the opening quote of the encoding name; any other byte is a
%% fatal bad_encoding error.
parse_EncodingDecl_EncName(<<$', Tail/bitstring>>, Stream, Pos, State) ->
    parse_EncodingDecl_EncName_sq(Tail, Stream, Pos + 1, State);
parse_EncodingDecl_EncName(<<$", Tail/bitstring>>, Stream, Pos, State) ->
    parse_EncodingDecl_EncName_dq(Tail, Stream, Pos + 1, State);
parse_EncodingDecl_EncName(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_encoding, State).
%% Parse an encoding name that must be closed by a single quote.
parse_EncodingDecl_EncName_sq(Bytes, Stream, Pos, State) ->
    parse_EncodingDecl_EncName_quoted($', Bytes, Stream, Pos, State).

%% Parse an encoding name that must be closed by a double quote.
parse_EncodingDecl_EncName_dq(Bytes, Stream, Pos, State) ->
    parse_EncodingDecl_EncName_quoted($", Bytes, Stream, Pos, State).

%% Shared worker: read the name and require the given closing quote directly
%% after it. Returns {{Name, true}, RestBytes, Stream, NewPos, State}; a
%% missing or mismatched quote is a fatal bad_encoding error.
parse_EncodingDecl_EncName_quoted(Quote, Bytes, Stream, Pos, State) ->
    case parse_EncodingDecl_EncName_name(Bytes, Stream, Pos, State) of
        {Name, <<Quote, Tail/bitstring>>, Stream1, Pos1, State1} ->
            {{Name, true}, Tail, Stream1, Pos1 + 1, State1};
        {_Name, _Other, _Stream1, _Pos1, State1} ->
            fatal_error(bad_encoding, State1)
    end.
%%----------------------------------------------------------------------
%% [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
%% /* Encoding name contains only Latin characters */
%%----------------------------------------------------------------------
%% Read an EncName: the first character must be an ASCII letter, otherwise
%% a fatal bad_char error is raised.
parse_EncodingDecl_EncName_name(<<C, Tail/bitstring>>, Stream, Pos, State) when
    C >= $A, C =< $Z; C >= $a, C =< $z
->
    parse_EncodingDecl_EncName_name_1(Tail, Stream, Pos, 1, State, []);
parse_EncodingDecl_EncName_name(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_char, State).

%% Continue the EncName over ASCII letters, digits, '.', '_' and '-'. Stops
%% at the first other byte and returns {Name, RestBytes, Stream, NewPos, State}.
parse_EncodingDecl_EncName_name_1(<<C, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    C >= $A, C =< $Z;
    C >= $a, C =< $z;
    C >= $0, C =< $9;
    C == $.;
    C == $_;
    C == $-
->
    parse_EncodingDecl_EncName_name_1(Tail, Stream, Pos, Len + 1, State, Acc);
parse_EncodingDecl_EncName_name_1(Bytes, Stream, Pos, Len, State, Acc) ->
    {?ACC(Stream, Pos, Len, Acc), Bytes, Stream, Pos + Len, State}.
%%----------------------------------------------------------------------
%% [25] Eq ::= S? '=' S?
%%----------------------------------------------------------------------
%% Consume "S? '=' S?" and return {RestBytes, Stream, NewPos, State}.
%% The first clause is a fast path for input that starts directly with '='.
parse_Eq(<<$=, _/bitstring>> = Bytes, Stream, Pos, State) ->
    parse_Eq_1(Bytes, Stream, Pos, State);
parse_Eq(Bytes, Stream, Pos, State) ->
    {_Found, Bytes1, Stream1, Pos1, State1} = maybe_consume_s(Bytes, Stream, Pos, State),
    parse_Eq_1(Bytes1, Stream1, Pos1, State1).

%% Require '=' at the head of the input, then skip trailing whitespace.
%% Anything else is a fatal bad_eq error.
parse_Eq_1(<<$=, Tail/bitstring>>, Stream, Pos, State) ->
    {_Found, Bytes1, Stream1, Pos1, State1} = maybe_consume_s(Tail, Stream, Pos + 1, State),
    {Bytes1, Stream1, Pos1, State1};
parse_Eq_1(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_eq, State).
%%----------------------------------------------------------------------
%% [26] VersionNum ::= '1.' [0-9]+
%%----------------------------------------------------------------------
%% Parse a version number that was opened with a single quote. The number
%% must start with "1."; the digits are then read up to the closing quote.
parse_VersionNum_sq(<<"1.", Tail/bitstring>>, Stream, Pos, State) ->
    parse_VersionNum_digit_sq(Tail, Stream, Pos + 2, 0, State, []);
parse_VersionNum_sq(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_version_num, State).

%% Digits after "1." up to the closing single quote. Returns
%% {{Version, true}, RestBytes, Stream, NewPos, State}.
parse_VersionNum_digit_sq(<<D, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    D >= $0, D =< $9
->
    parse_VersionNum_digit_sq(Tail, Stream, Pos, Len + 1, State, Acc);
parse_VersionNum_digit_sq(<<$', Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Digits = ?ACC(Stream, Pos, Len, Acc),
    Version = version_to_number(Digits, State),
    {{Version, true}, Tail, Stream, Pos + Len + 1, State};
parse_VersionNum_digit_sq(_Bytes, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_version_num, State).

%% Same as parse_VersionNum_sq/4 for a version opened with a double quote.
parse_VersionNum_dq(<<"1.", Tail/bitstring>>, Stream, Pos, State) ->
    parse_VersionNum_digit_dq(Tail, Stream, Pos + 2, 0, State, []);
parse_VersionNum_dq(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_version_num, State).

%% Digits after "1." up to the closing double quote.
parse_VersionNum_digit_dq(<<D, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when
    D >= $0, D =< $9
->
    parse_VersionNum_digit_dq(Tail, Stream, Pos, Len + 1, State, Acc);
parse_VersionNum_digit_dq(<<$", Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Digits = ?ACC(Stream, Pos, Len, Acc),
    Version = version_to_number(Digits, State),
    {{Version, true}, Tail, Stream, Pos + Len + 1, State};
parse_VersionNum_digit_dq(_Bytes, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_version_num, State).
%% Rebuild the full version string by prefixing the collected digits with
%% "1.". A version without any digits after the dot is a fatal
%% bad_version_num error.
version_to_number(Acc, State) ->
    Version = to_binary([<<"1.">> | Acc]),
    if
        Version =:= <<"1.">> -> fatal_error(bad_version_num, State);
        true -> Version
    end.
%%----------------------------------------------------------------------
%% [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
%%----------------------------------------------------------------------
%% Parse the VersionInfo part of an XML declaration. Whitespace must come
%% first; without it, or without the 'version' keyword, the result is
%% {{<<>>, false}, ...} so the caller can decide how to react.
parse_VersionInfo(Bytes, Stream, Pos, State) ->
    case maybe_consume_s(Bytes, Stream, Pos, State) of
        {true, Bytes1, Stream1, Pos1, State1} ->
            parse_VersionInfoS(Bytes1, Stream1, Pos1, State1);
        {false, Bytes1, Stream1, Pos1, State1} ->
            {{<<>>, false}, Bytes1, Stream1, Pos1, State1}
    end.

%% After the 'version' keyword: consume '=' and dispatch on the opening
%% quote. Any other byte is a fatal bad_version error.
parse_VersionInfo_version(Bytes, Stream, Pos, State) ->
    {Bytes1, Stream1, Pos1, State1} = parse_Eq(Bytes, Stream, Pos, State),
    case Bytes1 of
        <<$', Tail/bitstring>> ->
            parse_VersionNum_sq(Tail, Stream1, Pos1 + 1, State1);
        <<$", Tail/bitstring>> ->
            parse_VersionNum_dq(Tail, Stream1, Pos1 + 1, State1);
        _ ->
            fatal_error(bad_version, State1)
    end.

%% Recognise the 'version' keyword (leading whitespace already consumed).
parse_VersionInfoS(<<"version", Tail/bitstring>>, Stream, Pos, State) ->
    parse_VersionInfo_version(Tail, Stream, Pos + 7, State);
parse_VersionInfoS(Bytes, Stream, Pos, State) ->
    {{<<>>, false}, Bytes, Stream, Pos, State}.
%%----------------------------------------------------------------------
%% [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
%% leading space is trimmed already
%%----------------------------------------------------------------------
%% After the 'standalone' keyword: consume '=' and then the quoted yes/no value.
parse_SDDecl_standalone(Bytes, Stream, Pos, State) ->
    {Bytes1, Stream1, Pos1, State1} = parse_Eq(Bytes, Stream, Pos, State),
    parse_SDDecl_standalone_yesno(Bytes1, Stream1, Pos1, State1).

%% Dispatch on the opening quote of the standalone value; any other byte is
%% a fatal bad_standalone error.
parse_SDDecl_standalone_yesno(<<$', Tail/bitstring>>, Stream, Pos, State) ->
    parse_SDDecl_standalone_yesno_sq(Tail, Stream, Pos + 1, State);
parse_SDDecl_standalone_yesno(<<$", Tail/bitstring>>, Stream, Pos, State) ->
    parse_SDDecl_standalone_yesno_dq(Tail, Stream, Pos + 1, State);
parse_SDDecl_standalone_yesno(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_standalone, State).
%% Match the standalone value plus its closing single quote.
%% Returns {{IsStandalone, true}, RestBytes, Stream, NewPos, State}.
%% Any value other than yes/no (or a missing closing quote) is reported as a
%% fatal bad_standalone error, like every other malformed-input path in this
%% module, instead of surfacing as a bare function_clause crash.
parse_SDDecl_standalone_yesno_sq(<<"no'", Rest/bitstring>>, Stream, Pos, State) ->
    {{false, true}, Rest, Stream, Pos + 3, State};
parse_SDDecl_standalone_yesno_sq(<<"yes'", Rest/bitstring>>, Stream, Pos, State) ->
    {{true, true}, Rest, Stream, Pos + 4, State};
parse_SDDecl_standalone_yesno_sq(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_standalone, State).

%% Same as above for a value closed by a double quote.
parse_SDDecl_standalone_yesno_dq(<<"no\"", Rest/bitstring>>, Stream, Pos, State) ->
    {{false, true}, Rest, Stream, Pos + 3, State};
parse_SDDecl_standalone_yesno_dq(<<"yes\"", Rest/bitstring>>, Stream, Pos, State) ->
    {{true, true}, Rest, Stream, Pos + 4, State};
parse_SDDecl_standalone_yesno_dq(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_standalone, State).
%% Parse an optional standalone declaration (leading whitespace already
%% consumed). Without the 'standalone' keyword the result is
%% {{false, false}, ...}: not standalone, and not explicitly declared.
parse_SDDecl(<<"standalone", Tail/bitstring>>, Stream, Pos, State) ->
    parse_SDDecl_standalone(Tail, Stream, Pos + 10, State);
parse_SDDecl(Bytes, Stream, Pos, State) ->
    {{false, false}, Bytes, Stream, Pos, State}.
%%----------------------------------------------------------------------
%% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
%%----------------------------------------------------------------------
%% Parse the remainder of an XML declaration after the literal '<?xml':
%% version info, optional encoding declaration, optional standalone
%% declaration, optional whitespace and the closing '?>'. Emits a
%% startDocument event carrying the version and encoding; the standalone
%% value is parsed but not reported.
parse_XMLDecl_ltqxml(?MATCH) ->
% Matching on `true` asserts that a version was declared; when
% parse_VersionInfo reports `false` this line fails with a badmatch.
{{Version, true}, ?MATCH1} = parse_VersionInfo(?MATCH),
% IsWs2: was there whitespace between the version and whatever follows?
{IsWs2, ?MATCH2} = maybe_consume_s(?MATCH1),
{{Encoding, EncSet}, ?MATCH3} = parse_EncodingDecl(?MATCH2),
% Only the position after the standalone part is needed below.
{{_, _}, ?MATCH6} =
case IsWs2 of
% whitespace + encoding: standalone may follow, but needs its own whitespace
true when EncSet ->
{IsWs4, ?MATCH4} = maybe_consume_s(?MATCH3),
{{Standalone, SaSet}, ?MATCH5} = parse_SDDecl(?MATCH4),
case IsWs4 of
false when SaSet -> fatal_error(missing_ws, State);
_ -> {{Standalone, SaSet}, ?MATCH5}
end;
% whitespace but no encoding: that whitespace already separates standalone
true ->
parse_SDDecl(?MATCH3);
% encoding directly after the version without whitespace
false when EncSet ->
fatal_error(missing_ws, State);
% neither whitespace nor encoding: a standalone declaration found here
% still requires preceding whitespace
false ->
{IsWs4, ?MATCH4} = maybe_consume_s(?MATCH3),
{{Standalone, SaSet}, ?MATCH5} = parse_SDDecl(?MATCH4),
case IsWs4 of
false when SaSet -> fatal_error(missing_ws, State);
_ -> {{Standalone, SaSet}, ?MATCH5}
end
end,
% optional trailing whitespace, then the mandatory '?>'
{_, ?MATCH7} = maybe_consume_s(?MATCH6),
{Bytes8, _, _, State8} = parse_XMLDecl_end(?MATCH7),
% push the next parser position before handing the state back
State9 = set_next_parser_position(?misc_post_dtd, State8),
event_startDocument(
Version,
Encoding,
set_state_pos(State9, Bytes8)
).
%% Require the closing '?>' of the XML declaration and return
%% {RestBytes, Stream, NewPos, State}; anything else is a fatal bad_xmldecl error.
parse_XMLDecl_end(<<$?, $>, Tail/bitstring>>, Stream, Pos, State) ->
    {Tail, Stream, Pos + 2, State};
parse_XMLDecl_end(Other, _Stream, _Pos, State) ->
    fatal_error(bad_xmldecl, {Other, State}).
%% Exported entry point: parse an optional XML declaration at the start of
%% Stream and emit the startDocument event.
parse_XMLDecl(Stream, State) ->
    parse_XMLDecl(Stream, Stream, 0, State).

%% With a leading '<?xml' the declaration is parsed; otherwise the defaults
%% (version 1.0, encoding UTF-8) are reported and no input is consumed.
parse_XMLDecl(<<"<?xml", Tail/bitstring>>, Stream, Pos, State) ->
    parse_XMLDecl_ltqxml(Tail, Stream, Pos + 5, State);
parse_XMLDecl(Bytes, _Stream, _Pos, State) ->
    % default declaration
    NextState = set_next_parser_position(?misc_post_dtd, State),
    DefaultVersion = <<"1.0">>,
    DefaultEncoding = <<"UTF-8">>,
    event_startDocument(DefaultVersion, DefaultEncoding, set_state_pos(NextState, Bytes)).
%%----------------------------------------------------------------------
%% [27] Misc ::= Comment | PI | S
%%----------------------------------------------------------------------
%% Exported entry point: parse Misc content (comments and whitespace) at the
%% start of Stream.
parse_Misc(Stream, State) ->
    parse_Misc(Stream, Stream, 0, State).

%% A comment is handed to parse_Comment/4; empty input yields
%% {no_bytes, State}; whitespace is skipped and parsing continues; anything
%% else is left in place and the state is returned with rest_stream pointing
%% at it.
parse_Misc(<<"<!--", Tail/bitstring>>, Stream, Pos, State) ->
    parse_Comment(Tail, Stream, Pos + 4, State);
parse_Misc(<<>>, _Stream, _Pos, State) ->
    {no_bytes, State};
parse_Misc(Bytes, Stream, Pos, State) ->
    {Found, Bytes1, Stream1, Pos1, State1} = maybe_consume_s(Bytes, Stream, Pos, State),
    case Found of
        true -> parse_Misc(Bytes1, Stream1, Pos1, State1);
        false -> set_state_pos(State1, Bytes1)
    end.
%%----------------------------------------------------------------------
%% [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
%%----------------------------------------------------------------------
%% Exported entry point: parse the next piece of element content.
parse_content(Stream, State) ->
    parse_content(Stream, Stream, 0, State).

%% Dispatch on the leading bytes: end tag, comment, CDATA section, start
%% tag, reference, or plain character data. Clause order matters because
%% all markup starts with '<'.
parse_content(<<$<, $/, Tail/bitstring>>, Stream, Pos, State) ->
    parse_ETag(Tail, Stream, Pos + 2, State);
parse_content(<<$<, $!, $-, $-, Tail/bitstring>>, Stream, Pos, State) ->
    parse_Comment(Tail, Stream, Pos + 4, State);
parse_content(<<$<, $!, $[, Tail/bitstring>>, Stream, Pos, State) ->
    parse_CDSect(Tail, Stream, Pos + 3, State);
parse_content(<<$<, Tail/bitstring>>, Stream, Pos, State) ->
    parse_element_lt(Tail, Stream, Pos + 1, State);
parse_content(<<$&, Tail/bitstring>>, Stream, Pos, State) ->
    % the resolved reference seeds the character-data accumulator
    {{_Kind, Ref}, Bytes1, Stream1, Pos1, State1} =
        parse_Reference(Tail, Stream, Pos + 1, State, content),
    parse_CharData(Bytes1, Stream1, Pos1, State1, Ref);
parse_content(Bytes, Stream, Pos, State) ->
    parse_CharData(Bytes, Stream, Pos, State).
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%
%% %%
%% Namespace Stuff %%
%% %%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%
%%----------------------------------------------------------------------
%% [39] element ::= EmptyElemTag | STag content ETag
%% [WFC: Element Type Match]
%% [40] STag ::= '<' Name (S Attribute)* S? '>'
%% [WFC: Unique Att Spec]
%% [12ns] STag ::= '<' QName (S Attribute)* S? '>'
%% [NSC: Prefix Declared]
%% [WFC: Unique Att Spec]
%% [14ns] EmptyElemTag ::= '<' QName (S Attribute)* S? '/>'
%% [NSC: Prefix Declared]
%% [WFC: Unique Att Spec]
%%----------------------------------------------------------------------
%% Exported entry point: parse an element start tag at the head of Stream.
parse_element(Stream, State) ->
    parse_element(Stream, Stream, 0, State).

%% The input must begin with '<'; anything else is a fatal non_element error.
parse_element(<<$<, Tail/bitstring>>, Stream, Pos, State) ->
    parse_element_lt(Tail, Stream, Pos + 1, State);
parse_element(Other, _Stream, _Pos, State) ->
    fatal_error(non_element, {Other, State}).
%% Parse a start tag or empty-element tag whose '<' is already consumed:
%% the element name, optional attributes, then '>' or '/>'. Emits a
%% startElement event. When the name is followed directly by '>' or '/',
%% attribute parsing is skipped.
parse_element_lt(
    Bytes,
    Stream,
    Pos,
    #ys_state_simple{tags = Tags, position = P} = State
) ->
    {Name, Bytes1, Stream1, Pos1, State1} = parse_Name(Bytes, Stream, Pos, State),
    case Bytes1 of
        <<C, _/bitstring>> when C == $>; C == $/ ->
            parse_element_close(Bytes1, Stream1, Pos1, State1, Name, [], P, Tags);
        _ ->
            {Atts, Bytes2, Stream2, Pos2, State2} =
                parse_attributes(Bytes1, Stream1, Pos1, State1, [], Name),
            parse_element_close(Bytes2, Stream2, Pos2, State2, Name, Atts, P, Tags)
    end.

%% Finish a tag after its name and attributes. '>' opens the element: a
%% fresh state record is built with ?content pushed on the position stack
%% and the name pushed on the tag stack. '/' hands over to
%% parse_element_empty/8. Anything else is a fatal bad_element error.
parse_element_close(<<$>, Tail/bitstring>>, _Stream, _Pos, _State, Name, Atts, P, Tags) ->
    NewState = #ys_state_simple{
        position = [?content | P],
        tags = [Name | Tags],
        rest_stream = Tail
    },
    event_startElement(Name, Atts, NewState);
parse_element_close(<<$/, Tail/bitstring>>, Stream, Pos, State, Name, Atts, P, Tags) ->
    parse_element_empty(Tail, Stream, Pos + 1, State, Name, Atts, P, Tags);
parse_element_close(_Bytes, _Stream, _Pos, State, _Name, _Atts, _P, _Tags) ->
    fatal_error(bad_element, State).
% misc_post_element
%% Finish an empty-element tag after its '/': the next byte must be '>'.
%% Emits a startElement event with ?empty pushed on the position stack (for
%% an empty root element ?element is replaced by ?empty followed by
%% ?misc_post_element) and the name pushed on the tag stack. Anything other
%% than '>' is a fatal bad_element error.
parse_element_empty(<<$>, Tail/bitstring>>, _Stream, _Pos, State, QName, Atts, P, Tags) ->
    NewState = State#ys_state_simple{
        position = parse_element_empty_position(P),
        tags = [QName | Tags],
        rest_stream = Tail
    },
    event_startElement(QName, Atts, NewState);
parse_element_empty(_Bytes, _Stream, _Pos, State, _QName, _Atts, _P, _Tags) ->
    fatal_error(bad_element, State).

%% Position stack to use after an empty-element tag.
parse_element_empty_position([?element | Ps]) ->
    % Empty root element
    [?empty, ?misc_post_element | Ps];
parse_element_empty_position(P) ->
    [?empty | P].
%%----------------------------------------------------------------------
%% [41] Attribute ::= Name Eq AttValue
%% [WFC: No External Entity References]
%% [WFC: No < in Attribute Values]
%% Initial S is trimmed, first character is not ">" or "/"
%% [1ns] NSAttName ::= PrefixedAttName | DefaultAttName
%% [2ns] PrefixedAttName ::= 'xmlns:' NCName
%% [NSC: Reserved Prefixes and Namespace Names]
%% [3ns] DefaultAttName ::= 'xmlns'
%% [15ns] Attribute ::= NSAttName Eq AttValue | QName Eq AttValue
%% [NSC: Prefix Declared]
%% [NSC: No Prefix Undeclaring]
%% [NSC: Attributes Unique]
%% [VC: Attribute Value Type]
%%----------------------------------------------------------------------
%% Parse one attribute: Name, '=' (with optional whitespace) and a quoted
%% value. Returns {{Name, Value}, RestBytes, Stream, NewPos, State}.
parse_Attribute(Bytes, Stream, Pos, State) ->
    {Name, Bytes1, Stream1, Pos1, State1} = parse_Name(Bytes, Stream, Pos, State),
    {Bytes2, Stream2, Pos2, State2} = parse_Eq(Bytes1, Stream1, Pos1, State1),
    {Value, Bytes3, Stream3, Pos3, State3} = parse_AttValue(Bytes2, Stream2, Pos2, State2),
    {{Name, Value}, Bytes3, Stream3, Pos3, State3}.
%% Collect the attributes of a tag until '>' or '/' is reached. Each
%% attribute must be preceded by whitespace. Attributes are prepended to the
%% accumulator, so the returned list is in reverse document order.
%% Returns {Atts, RestBytes, Stream, NewPos, State}; when neither whitespace
%% nor a tag end follows, the input is returned as-is for the caller to reject.
parse_attributes(<<C, _/bitstring>> = Bytes, Stream, Pos, State, Atts, _EName) when
    C == $>; C == $/
->
    {Atts, Bytes, Stream, Pos, State};
parse_attributes(Bytes, Stream, Pos, State, Atts, EName) ->
    case maybe_consume_s(Bytes, Stream, Pos, State) of
        {_Found, <<C, _/bitstring>> = Bytes1, Stream1, Pos1, State1} when C == $>; C == $/ ->
            {Atts, Bytes1, Stream1, Pos1, State1};
        {false, Bytes1, Stream1, Pos1, State1} ->
            {Atts, Bytes1, Stream1, Pos1, State1};
        {true, Bytes1, Stream1, Pos1, State1} ->
            {Att, Bytes2, Stream2, Pos2, State2} =
                parse_Attribute(Bytes1, Stream1, Pos1, State1),
            % the first clause ends the loop when the tag end follows directly
            parse_attributes(Bytes2, Stream2, Pos2, State2, [Att | Atts], EName)
    end.
%%----------------------------------------------------------------------
%% [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
%%----------------------------------------------------------------------
%% Dispatch on the opening quote of an attribute value; any other byte is a
%% fatal bad_attval error.
parse_AttValue(<<$', Tail/bitstring>>, Stream, Pos, State) ->
    parse_AttValue_sq(Tail, Stream, Pos + 1, 0, State, []);
parse_AttValue(<<$", Tail/bitstring>>, Stream, Pos, State) ->
    parse_AttValue_dq(Tail, Stream, Pos + 1, 0, State, []);
parse_AttValue(_Bytes, _Stream, _Pos, State) ->
    fatal_error(bad_attval, State).
%% Scan a single-quoted attribute value up to the closing quote. '<' is a
%% fatal bad_attval error, references are resolved and spliced in,
%% whitespace is kept as-is, and every other character goes through
%% ?ONECHAR. Returns {Value, RestBytes, Stream, NewPos, State}.
parse_AttValue_sq(<<$', Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    {to_binary(?ACC(Stream, Pos, Len, Acc)), Tail, Stream, Pos + Len + 1, State};
parse_AttValue_sq(<<$<, _/bitstring>>, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_attval, State);
parse_AttValue_sq(<<$&, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Before = ?ACC(Stream, Pos, Len, Acc),
    {{_Kind, Ref}, Bytes1, Stream1, Pos1, State1} =
        parse_Reference(Tail, Stream, Pos + Len + 1, State, attribute),
    parse_AttValue_sq(Bytes1, Stream1, Pos1, 0, State1, ?APPEND(Ref, Before));
parse_AttValue_sq(<<C, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when ?WS(C) ->
    parse_AttValue_sq(Tail, Stream, Pos, Len + 1, State, Acc);
?ONECHAR.
%% Scan a double-quoted attribute value up to the closing quote. '<' is a
%% fatal bad_attval error, references are resolved and spliced in,
%% whitespace is kept as-is, and every other character goes through
%% ?ONECHAR. Returns {Value, RestBytes, Stream, NewPos, State}.
parse_AttValue_dq(<<$", Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    {to_binary(?ACC(Stream, Pos, Len, Acc)), Tail, Stream, Pos + Len + 1, State};
parse_AttValue_dq(<<$<, _/bitstring>>, _Stream, _Pos, _Len, State, _Acc) ->
    fatal_error(bad_attval, State);
parse_AttValue_dq(<<$&, Tail/bitstring>>, Stream, Pos, Len, State, Acc) ->
    Before = ?ACC(Stream, Pos, Len, Acc),
    {{_Kind, Ref}, Bytes1, Stream1, Pos1, State1} =
        parse_Reference(Tail, Stream, Pos + Len + 1, State, attribute),
    parse_AttValue_dq(Bytes1, Stream1, Pos1, 0, State1, ?APPEND(Ref, Before));
parse_AttValue_dq(<<C, Tail/bitstring>>, Stream, Pos, Len, State, Acc) when ?WS(C) ->
    parse_AttValue_dq(Tail, Stream, Pos, Len + 1, State, Acc);
?ONECHAR.
%%----------------------------------------------------------------------
%% -[42] ETag ::= '</' Name S? '>'
%% +[13ns] ETag ::= '</' QName S? '>' [NSC: Prefix Declared]
%% '</' is already trimmed
%%----------------------------------------------------------------------
%% Parse an end tag whose '</' is already consumed. The name must equal the
%% tag on top of the tag stack. When the position stack shows that the
%% element being closed is the root (second entry is ?element), the new
%% position stack is [?misc_post_element]; otherwise the top entry is popped.
parse_ETag(
    Bytes,
    Stream,
    Pos,
    #ys_state_simple{position = [_, ?element | _], tags = [Tag | Ts]} = State
) ->
    parse_ETag_close(Bytes, Stream, Pos, State, Tag, Ts, [?misc_post_element]);
parse_ETag(
    Bytes,
    Stream,
    Pos,
    #ys_state_simple{position = [_ | Ps], tags = [Tag | Ts]} = State
) ->
    parse_ETag_close(Bytes, Stream, Pos, State, Tag, Ts, Ps).

%% Shared worker: Name, optional whitespace, then '>'. Emits endElement with
%% a fresh state record. A name mismatch is a fatal unmatched_tag error, a
%% missing '>' a fatal bad_endtag error.
parse_ETag_close(Bytes, Stream, Pos, State, Tag, Ts, NextPosition) ->
    {Name, Bytes1, Stream1, Pos1, State1} = parse_Name(Bytes, Stream, Pos, State),
    {_Found, Bytes2, _Stream2, _Pos2, State2} =
        maybe_consume_s(Bytes1, Stream1, Pos1, State1),
    case Bytes2 of
        <<$>, Tail/bitstring>> when Name =:= Tag ->
            NewState = #ys_state_simple{
                rest_stream = Tail,
                position = NextPosition,
                tags = Ts
            },
            event_endElement(Name, NewState);
        <<$>, _/bitstring>> ->
            fatal_error(unmatched_tag, {Tag, Name});
        _ ->
            fatal_error(bad_endtag, State2)
    end.
%% Push Next onto the state's position stack.
set_next_parser_position(Next, #ys_state_simple{position = Stack} = State) ->
    Pushed = [Next | Stack],
    State#ys_state_simple{position = Pushed}.
%% Abort parsing: raise an error exception with the given reason, recording
%% the second argument as the argument list of the failing call.
fatal_error(Reason, State) ->
    erlang:error(Reason, [State]).
%% Build a state record that carries over the position and tag stacks of the
%% given state and uses Rest as the remaining input. Only these three fields
%% are set explicitly; any other record field takes its declared default.
set_state_pos(#ys_state_simple{position = Position, tags = Tags}, Rest) ->
    #ys_state_simple{
        tags = Tags,
        position = Position,
        rest_stream = Rest
    }.
%% Resolve a general entity name to its replacement text, tagged as
%% {gen, Text}. The five predefined XML entities are expanded; any other
%% name is passed through unexpanded as "&Name;". The state and reference
%% type arguments are not used.
resolve_general_entity(Name, _State, _Type) ->
    Text =
        case Name of
            <<"amp">> -> <<"&">>;
            <<"lt">> -> <<"<">>;
            <<"gt">> -> <<">">>;
            <<"apos">> -> <<"'">>;
            <<"quot">> -> <<"\"">>;
            _ -> <<$&, Name/binary, $;>>
        end,
    {gen, Text}.
%% Return a binary unchanged; flatten any other iodata into a binary.
to_binary(Data) ->
    case is_binary(Data) of
        true -> Data;
        false -> iolist_to_binary(Data)
    end.
%% ====================================================================
%% Events
%% ====================================================================
%% Event constructors. Each returns {Event, State}, pairing the event term
%% with the state to continue from.

%% Start of document, carrying the declared (or default) version and encoding.
event_startDocument(Version, Encoding, State) ->
    {{startDocument, Version, Encoding}, State}.

%% End of document.
event_endDocument(State) ->
    {endDocument, State}.

%% Start of an element with its attribute list.
event_startElement(QName, Attributes, State) ->
    {{startElement, QName, Attributes}, State}.

%% End of an element.
event_endElement(QName, State) ->
    {{endElement, QName}, State}.

%% Character data. Empty text produces no event: the bare state is returned
%% instead of an {Event, State} pair.
event_characters(Data, State) ->
    case Data of
        <<>> -> State;
        _ -> {{characters, Data}, State}
    end.