66 #include "html-strings.h"
69 #include "htmlparser.h"
/* Debug output: PRINTF((fmt, args)) expands to printf (fmt, args); the
 * caller supplies the whole argument list inside an extra pair of
 * parentheses. */
75 #define PRINTF(x) printf x
/* Lowercase letter codes, built from the uppercase ISO_* codes by setting
 * bit 0x20. The uppercase macros are defined outside this excerpt. */
93 #define ISO_a (ISO_A | 0x20)
94 #define ISO_b (ISO_B | 0x20)
95 #define ISO_e (ISO_E | 0x20)
96 #define ISO_f (ISO_F | 0x20)
97 #define ISO_g (ISO_G | 0x20)
98 #define ISO_h (ISO_H | 0x20)
99 #define ISO_i (ISO_I | 0x20)
100 #define ISO_l (ISO_L | 0x20)
101 #define ISO_m (ISO_M | 0x20)
102 #define ISO_p (ISO_P | 0x20)
103 #define ISO_r (ISO_R | 0x20)
104 #define ISO_t (ISO_T | 0x20)
/* Character codes for the punctuation the parser reacts to, in order:
 * space, '!', double quote (citation), '&', single quote (citation2),
 * '*', '-', '/', ';'. */
109 #define ISO_space 0x20
110 #define ISO_bang 0x21
111 #define ISO_citation 0x22
112 #define ISO_ampersand 0x26
113 #define ISO_citation2 0x27
114 #define ISO_asterisk 0x2a
115 #define ISO_dash 0x2d
116 #define ISO_slash 0x2f
117 #define ISO_semicolon 0x3b
/* Values of htmlparser_state.minorstate, the tokenizer state that
 * parse_word() switches on:
 *   TEXT            plain text; '<' enters TAG, '&' enters EXTCHAR
 *   EXTCHAR         inside an "&...;" sequence; ';' or whitespace returns
 *                   to TEXT
 *   TAG             collecting the tag name into s.tag
 *   TAGEND          skipping input until the closing '>' is seen
 *   TAGATTR         collecting an attribute name into s.tagattr
 *   TAGATTRSPACE    whitespace after an attribute name; '=' starts a value
 *   TAGATTRPARAM    inside a quoted attribute value (ends at s.quotechar)
 *   TAGATTRPARAMNQ  attribute value before/without an opening quote
 *   HTMLCOMMENT     inside a comment (entered when the tag starts "!--")
 */
122 #define MINORSTATE_NONE 0
123 #define MINORSTATE_TEXT 1
124 #define MINORSTATE_EXTCHAR 2
125 #define MINORSTATE_TAG 3
126 #define MINORSTATE_TAGEND 4
127 #define MINORSTATE_TAGATTR 5
128 #define MINORSTATE_TAGATTRSPACE 6
130 #define MINORSTATE_TAGATTRPARAM 7
131 #define MINORSTATE_TAGATTRPARAMNQ 8
133 #define MINORSTATE_HTMLCOMMENT 9
/* Values of htmlparser_state.majorstate / lastmajorstate. The numeric
 * order is significant: the word-flushing code tests
 * `majorstate >= MAJORSTATE_DISCARD`, which covers DISCARD and SCRIPT. */
135 #define MAJORSTATE_NONE 0
136 #define MAJORSTATE_BODY 1
137 #define MAJORSTATE_LINK 2
138 #define MAJORSTATE_FORM 3
139 #define MAJORSTATE_DISCARD 4
140 #define MAJORSTATE_SCRIPT 5
/* Complete state of the HTML parser. NOTE(review): some members used by
 * the code (s.tag, s.tagattr) and the closing brace are not visible in
 * this excerpt. */
142 struct htmlparser_state {
/* Current tokenizer state, one of the MINORSTATE_* values. */
144 unsigned char minorstate;
/* Write index into s.tag while the tag name is collected. */
146 unsigned char tagptr;
/* Write index into s.tagattr while an attribute name is collected. */
148 unsigned char tagattrptr;
/* Attribute value buffer and its write index. */
149 char tagattrparam[WWW_CONF_MAX_URLLEN + 1];
150 unsigned char tagattrparamptr;
/* Quote character that terminates the current quoted attribute value. */
151 unsigned char quotechar;
/* Current and previous MAJORSTATE_* value; switch_majorstate() saves the
 * current state into lastmajorstate before changing it. */
152 unsigned char majorstate, lastmajorstate;
/* URL copied from the href attribute value; later passed to
 * htmlparser_link(). */
153 char linkurl[WWW_CONF_MAX_URLLEN + 1];
/* Text accumulated by add_char() and the number of bytes in it. */
155 char word[WWW_CONF_WEBPAGE_WIDTH];
156 unsigned char wordlen;
/* Form action string copied from the action attribute; the extra byte
 * holds the terminating 0. */
159 char formaction[WWW_CONF_MAX_FORMACTIONLEN + 1];
/* Type (HTMLPARSER_INPUTTYPE_*), name, value and display size of the
 * input element currently being parsed. */
160 unsigned char inputtype;
161 char inputname[WWW_CONF_MAX_INPUTNAMELEN + 1];
162 char inputvalue[WWW_CONF_MAX_INPUTVALUELEN + 1];
163 unsigned char inputvaluesize;
/* The single file-local parser state used by the functions below. */
167 static struct htmlparser_state s;
/* One-byte array holding 0xff.
 * NOTE(review): presumably an end-of-table sentinel entry for tags[]; its
 * use is not visible in this excerpt - confirm. */
170 static char last[1] = {(char)0xff};
/* Table of tag-name strings consulted by the tag lookup code, which reads
 * tags[index][i]. The TAG_* macros are interleaved with the table entries;
 * the lookup result is compared against them (e.g. TAG_SLASHSCRIPT).
 * NOTE(review): the string entries themselves, the remaining TAG_* macros
 * and the closing brace are not visible in this excerpt; presumably each
 * TAG_* value is the index of the entry next to it - confirm. */
172 static const char *tags[] = {
176 #define TAG_SLASHDIV 1
178 #define TAG_SLASHFORM 2
182 #define TAG_SLASHSCRIPT 4
184 #define TAG_SLASHSELECT 5
186 #define TAG_SLASHSTYLE 6
212 #define TAG_SCRIPT 19
214 #define TAG_SELECT 20
/* Start of a whitespace test: true when c is a space or matches one of the
 * alternatives on the following lines, which are not visible here.
 * NOTE(review): presumably the body of iswhitespace() - confirm. */
228 return (c == ISO_space ||
/* Reset the per-input-element state (the enclosing function header is not
 * visible in this excerpt). */
238 s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
/* One chained assignment empties the name and value strings and stores a
 * 0 in the last element of formaction, inputname and inputvalue. The
 * strncpy() calls that fill these buffers copy at most the MAX length, so
 * that final byte keeps the strings terminated. */
239 s.inputname[0] = s.inputvalue[0] =
240 s.formaction[WWW_CONF_MAX_FORMACTIONLEN] =
241 s.inputname[WWW_CONF_MAX_INPUTNAMELEN] =
242 s.inputvalue[WWW_CONF_MAX_INPUTVALUELEN] = 0;
/* Default field size, used unless a size attribute overrides it. */
243 s.inputvaluesize = 20;
/* Initialize the parser state: both the current and the previous major
 * state start as MAJORSTATE_DISCARD, and the tokenizer starts in
 * MINORSTATE_TEXT. Takes no arguments. NOTE(review): other lines of this
 * function may be missing from this excerpt. */
248 htmlparser_init(
void)
250 s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
251 s.minorstate = MINORSTATE_TEXT;
/* Keep the low five bits of c and set bits 0x60, which maps an ASCII
 * letter of either case to its lowercase form (e.g. 0x41 'A' -> 0x61 'a').
 * NOTE(review): presumably the letter branch of lowercase(); the guard
 * that restricts this to letters is not visible - confirm. */
265 return (c & 0x1f) | 0x60;
/* Terminate the attribute name and attribute value buffers at their
 * current write positions so they can be used as C strings. (The enclosing
 * function header is not visible in this excerpt.) */
275 s.tagattr[s.tagattrptr] = 0;
276 s.tagattrparam[s.tagattrparamptr] = 0;
/* Change the major parser state.
 *
 * newstate: the MAJORSTATE_* value to enter.
 *
 * Nothing happens when newstate equals the current state. Otherwise the
 * transition is logged, the current state is saved in s.lastmajorstate and
 * s.majorstate becomes newstate. */
280 switch_majorstate(
unsigned char newstate)
282 if(s.majorstate != newstate) {
283 PRINTF((
"Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
284 s.lastmajorstate = s.majorstate;
285 s.majorstate = newstate;
/* Append one character to the word buffer.
 *
 * c: the character to store.
 *
 * The character is stored only when at least one byte would remain free
 * afterwards (wordlen < WWW_CONF_WEBPAGE_WIDTH - 1) and c is below 0x80;
 * anything else is dropped. NOTE(review): the increment of s.wordlen is
 * not visible in this excerpt. */
290 add_char(
unsigned char c)
292 if(s.wordlen < WWW_CONF_WEBPAGE_WIDTH - 1 && c < 0x80) {
293 s.word[s.wordlen] = c;
/* Word flushing logic (the enclosing function header and several lines
 * are not visible in this excerpt). Three cases are distinguished by the
 * major state: inside a link, in DISCARD or SCRIPT (>= MAJORSTATE_DISCARD),
 * and everything else.
 * NOTE(review): s.word[s.wordlen - 1] reads before the buffer when
 * s.wordlen is 0; presumably an outer `s.wordlen > 0` guard exists on a
 * line not shown - confirm. */
302 if(s.majorstate == MAJORSTATE_LINK) {
303 if(s.word[s.wordlen - 1] != ISO_space) {
306 }
else if(s.majorstate >= MAJORSTATE_DISCARD) {
/* Terminate the accumulated text and hand it, with its length, to
 * htmlparser_word(). */
309 s.word[s.wordlen] =
'\0';
310 htmlparser_word(s.word, s.wordlen);
/* Calls htmlparser_newline(), which is defined outside this excerpt. The
 * enclosing function is not visible here. */
320 htmlparser_newline();
/* Tag table search (presumably the body of find_tag(); the header and
 * several lines are not visible in this excerpt).
 * first/last bound the range of candidate table entries, i is the
 * character position being compared and tabi is the table index being
 * examined. Note that this local `last` shadows the file-scope array of
 * the same name. */
326 static unsigned char first, last, i, tabi;
/* Match: the input name ends here (0 or '/') and so does the entry at
 * `first`. */
336 if((tagc == 0 || tagc == ISO_slash) && tags[first][i] == 0) {
/* Step past entries whose character at position i is smaller than the
 * input character. NOTE(review): this relies on tags[] being sorted -
 * confirm against the table. */
343 while(tagc > (tags[tabi])[i] && tabi < last) {
/* Step over the entries that have the same character at position i. */
349 while(tagc == (tags[tabi])[i] && tabi < last) {
/* Repeat until the candidate range has collapsed to a single entry. */
358 }
while(last != first);
/* Tag and attribute handling (the enclosing function header, the switch
 * statement and most case labels are not visible in this excerpt).
 * It works on s.tag, s.tagattr and s.tagattrparam as collected by the
 * tokenizer. The three scratch variables below have static storage. */
365 static char *tagattrparam;
366 static unsigned char tag;
367 static unsigned char size;
/* Look up the collected tag name; the result is compared against the
 * TAG_* values below. */
369 tag = find_tag(s.tag);
/* Inside a script, every tag other than the closing script tag takes this
 * branch (its body is not visible here). */
372 if(s.majorstate == MAJORSTATE_SCRIPT && tag != TAG_SLASHSCRIPT) {
376 PRINTF((
"Parsing tag '%s' '%s' '%s'\n", s.tag, s.tagattr, s.tagattrparam));
/* NOTE(review): an empty attribute name presumably means this call is for
 * the tag itself rather than one of its attributes - confirm. */
393 if(s.tagattr[0] == 0) {
395 add_char(ISO_asterisk);
400 switch_majorstate(MAJORSTATE_SCRIPT);
404 switch_majorstate(MAJORSTATE_DISCARD);
/* Closing tags that return to the major state saved by
 * switch_majorstate(). */
406 case TAG_SLASHSCRIPT:
408 case TAG_SLASHSELECT:
410 switch_majorstate(s.lastmajorstate);
/* Sets both the current and the previous major state to BODY directly,
 * without going through switch_majorstate(). */
413 s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
/* An alt attribute with a non-empty value: feed the value to add_char()
 * one character at a time.
 * NOTE(review): the sizeof-based strncmp length assumes html_alt is a char
 * array (so the terminator is included in the comparison) - confirm in
 * html-strings.h. The same pattern is used for all html_* comparisons
 * below. */
416 if(strncmp(s.tagattr, html_alt,
sizeof(html_alt)) == 0 && s.tagattrparam[0] != 0) {
418 tagattrparam = &s.tagattrparam[0];
419 while(*tagattrparam) {
420 add_char(*tagattrparam);
428 PRINTF((
"A %s %s\n", s.tagattr, s.tagattrparam));
/* An href attribute with a non-empty value: remember the URL and enter the
 * LINK state. The unbounded strcpy() fits because linkurl and tagattrparam
 * have the same declared size and the tokenizer stops filling tagattrparam
 * one byte before its end. */
429 if(strncmp(s.tagattr, html_href,
sizeof(html_href)) == 0 && s.tagattrparam[0] != 0) {
430 strcpy(s.linkurl, s.tagattrparam);
432 switch_majorstate(MAJORSTATE_LINK);
/* Leaving a link: restore the previous major state, terminate the
 * accumulated link text and report text, length and URL to
 * htmlparser_link(). */
436 if(s.majorstate == MAJORSTATE_LINK) {
437 switch_majorstate(s.lastmajorstate);
438 s.word[s.wordlen] = 0;
439 htmlparser_link(s.word, s.wordlen, s.linkurl);
/* Empty attribute name and a known form action: report the form. */
447 if(s.tagattr[0] == 0 && s.formaction[0] != 0) {
448 htmlparser_form(s.formaction);
451 PRINTF((
"Form tag\n"));
452 switch_majorstate(MAJORSTATE_FORM);
/* The action attribute supplies the form action string.
 * NOTE(review): this copy is limited to WWW_CONF_MAX_FORMACTIONLEN - 1
 * bytes, while the input name/value copies below use the full MAX length.
 * formaction has MAX + 1 bytes, so both limits stay in bounds; the
 * difference looks unintentional - confirm. */
453 if(strncmp(s.tagattr, html_action,
sizeof(html_action)) == 0) {
454 PRINTF((
"Form action '%s'\n", s.tagattrparam));
455 strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
460 switch_majorstate(MAJORSTATE_BODY);
/* Input handling applies only while inside a form. */
464 if(s.majorstate == MAJORSTATE_FORM) {
/* Empty attribute name and a non-empty input name: render the input
 * according to its type. NONE, TEXT and HIDDEN become an input field;
 * SUBMIT and IMAGE become a submit button. */
467 if(s.tagattr[0] == 0 && s.inputname[0] != 0) {
468 PRINTF((
"Render input type %d\n", s.inputtype));
469 switch(s.inputtype) {
470 case HTMLPARSER_INPUTTYPE_NONE:
471 case HTMLPARSER_INPUTTYPE_TEXT:
472 case HTMLPARSER_INPUTTYPE_HIDDEN:
473 htmlparser_inputfield(s.inputtype, s.inputvaluesize, s.inputvalue, s.inputname);
475 case HTMLPARSER_INPUTTYPE_SUBMIT:
476 case HTMLPARSER_INPUTTYPE_IMAGE:
477 htmlparser_submitbutton(s.inputvalue, s.inputname);
482 PRINTF((
"Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
/* Attribute dispatch for an input element: type, name, alt (image inputs
 * only), value and size. */
483 if(strncmp(s.tagattr, html_type,
sizeof(html_type)) == 0) {
484 if(strncmp(s.tagattrparam, html_submit,
sizeof(html_submit)) == 0) {
485 s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
486 }
else if(strncmp(s.tagattrparam, html_image,
sizeof(html_image)) == 0) {
487 s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
488 }
else if(strncmp(s.tagattrparam, html_text,
sizeof(html_text)) == 0) {
489 s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
490 }
else if(strncmp(s.tagattrparam, html_hidden,
sizeof(html_hidden)) == 0) {
491 s.inputtype = HTMLPARSER_INPUTTYPE_HIDDEN;
493 s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
495 }
else if(strncmp(s.tagattr, html_name,
sizeof(html_name)) == 0) {
496 strncpy(s.inputname, s.tagattrparam, WWW_CONF_MAX_INPUTNAMELEN);
497 }
else if(strncmp(s.tagattr, html_alt,
sizeof(html_alt)) == 0 &&
498 s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
499 strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
500 }
else if(strncmp(s.tagattr, html_value,
sizeof(html_value)) == 0) {
501 strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
502 }
else if(strncmp(s.tagattr, html_size,
sizeof(html_size)) == 0) {
/* Parse at most two leading decimal digits of the size value. */
504 if(s.tagattrparam[0] >=
'0' &&
505 s.tagattrparam[0] <=
'9') {
506 size = s.tagattrparam[0] -
'0';
507 if(s.tagattrparam[1] >=
'0' &&
508 s.tagattrparam[1] <=
'9') {
509 size = size * 10 + (s.tagattrparam[1] -
'0');
/* Clamp the size so it stays below the value buffer length. */
512 if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
513 size = WWW_CONF_MAX_INPUTVALUELEN - 1;
515 s.inputvaluesize = size;
/* Tokenizer: run the minor-state machine over a piece of input.
 *
 * data: the bytes to parse.
 * dlen: number of bytes in data; being a uint8_t it cannot exceed 255.
 *
 * NOTE(review): the return type, the local declarations (the loops use
 * `i`, `len` and `c`), the per-character fetch and most branch bodies are
 * on lines not visible in this excerpt. */
525 parse_word(
char *data, uint8_t dlen)
533 switch(s.minorstate) {
/* Plain text: '<' starts a tag, '&' starts an "&...;" sequence. */
534 case MINORSTATE_TEXT:
535 for(i = 0; i < len; ++i) {
537 if(iswhitespace(c)) {
539 }
else if(c == ISO_lt) {
540 s.minorstate = MINORSTATE_TAG;
543 }
else if(c == ISO_ampersand) {
544 s.minorstate = MINORSTATE_EXTCHAR;
/* Inside an "&...;" sequence: ';' or whitespace returns to plain text. */
551 case MINORSTATE_EXTCHAR:
552 for(i = 0; i < len; ++i) {
554 if(c == ISO_semicolon) {
555 s.minorstate = MINORSTATE_TEXT;
558 }
else if(iswhitespace(c)) {
559 s.minorstate = MINORSTATE_TEXT;
/* While in a script, anything that does not begin with '/' is treated as
 * text again rather than as a tag. */
571 if(s.majorstate == MAJORSTATE_SCRIPT && data[0] != ISO_slash) {
572 s.minorstate = MINORSTATE_TEXT;
/* Tag name collection. */
580 for(i = 0; i < len; ++i) {
/* Back to text with both attribute indexes reset (the condition that leads
 * here is not visible in this excerpt). */
584 s.minorstate = MINORSTATE_TEXT;
585 s.tagattrptr = s.tagattrparamptr = 0;
589 }
else if(iswhitespace(c)) {
592 s.minorstate = MINORSTATE_TAGATTR;
/* Tag names are stored lowercased. When the tag buffer is full, the rest
 * of the tag is skipped via TAGEND. */
599 s.tag[s.tagptr] = lowercase(c);
603 if(s.tagptr ==
sizeof(s.tag)) {
604 s.minorstate = MINORSTATE_TAGEND;
/* A tag name starting with "!--" opens a comment. */
611 s.tag[0] == ISO_bang &&
612 s.tag[1] == ISO_dash &&
613 s.tag[2] == ISO_dash) {
614 PRINTF((
"Starting comment...\n"));
615 s.minorstate = MINORSTATE_HTMLCOMMENT;
/* Attribute name collection. */
622 case MINORSTATE_TAGATTR:
625 for(i = 0; i < len; ++i) {
629 s.minorstate = MINORSTATE_TEXT;
630 s.tagattrparamptr = 0;
637 }
else if(iswhitespace(c)) {
638 if(s.tagattrptr == 0) {
642 s.tagattrparamptr = 0;
645 s.minorstate = MINORSTATE_TAGATTRSPACE;
648 }
else if(c == ISO_eq) {
/* '=' after the name: a value follows. */
649 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
650 s.tagattrparamptr = 0;
/* Attribute names are stored lowercased. When the name buffer is full, the
 * rest of the tag is skipped via TAGEND. */
654 s.tagattr[s.tagattrptr] = lowercase(c);
658 if(s.tagattrptr ==
sizeof(s.tagattr)) {
659 s.minorstate = MINORSTATE_TAGEND;
/* Whitespace after an attribute name: '=' starts a value, any other
 * character becomes the first character of a new attribute name. */
665 case MINORSTATE_TAGATTRSPACE:
666 for(i = 0; i < len; ++i) {
668 if(iswhitespace(c)) {
670 }
else if(c == ISO_eq) {
671 s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
672 s.tagattrparamptr = 0;
677 s.tagattr[0] = lowercase(c);
679 s.minorstate = MINORSTATE_TAGATTR;
/* Attribute value that is not (yet) known to be quoted. */
684 case MINORSTATE_TAGATTRPARAMNQ:
687 for(i = 0; i < len; ++i) {
693 s.minorstate = MINORSTATE_TEXT;
700 }
else if(iswhitespace(c) && s.tagattrparamptr == 0) {
/* Whitespace before the value has started (branch body not visible). */
702 }
else if((c == ISO_citation ||
703 c == ISO_citation2) && s.tagattrparamptr == 0) {
/* A quote as the very first value character switches to the quoted-value
 * state. NOTE(review): presumably s.quotechar is recorded on a line not
 * shown here - confirm. */
704 s.minorstate = MINORSTATE_TAGATTRPARAM;
706 PRINTF((
"tag attr param q found\n"));
708 }
else if(iswhitespace(c)) {
/* Whitespace after value characters ends the unquoted value. */
709 PRINTF((
"Non-leading space found at %d\n", s.tagattrparamptr));
714 s.minorstate = MINORSTATE_TAGATTR;
/* Store the value character as-is. When only the last byte of the value
 * buffer remains, the rest of the tag is skipped via TAGEND. */
719 s.tagattrparam[s.tagattrparamptr] = c;
723 if(s.tagattrparamptr >=
sizeof(s.tagattrparam) - 1) {
724 s.minorstate = MINORSTATE_TAGEND;
/* Quoted attribute value: ends at the character equal to s.quotechar. */
730 case MINORSTATE_TAGATTRPARAM:
733 for(i = 0; i < len; ++i) {
735 if(c == s.quotechar) {
740 s.minorstate = MINORSTATE_TAGATTR;
/* Any whitespace inside the quoted value is stored as a plain space; other
 * characters are stored unchanged. */
745 if(iswhitespace(c)) {
746 s.tagattrparam[s.tagattrparamptr] = ISO_space;
748 s.tagattrparam[s.tagattrparamptr] = c;
754 if(s.tagattrparamptr >=
sizeof(s.tagattrparam) - 1) {
755 s.minorstate = MINORSTATE_TAGEND;
/* Inside a comment: '>' ends it only when s.tagptr is non-zero.
 * NOTE(review): s.tagptr is presumably reused here to track preceding
 * dashes; the lines that update it are not visible - confirm. */
761 case MINORSTATE_HTMLCOMMENT:
762 for(i = 0; i < len; ++i) {
766 }
else if(c == ISO_gt && s.tagptr > 0) {
767 PRINTF((
"Comment done.\n"));
768 s.minorstate = MINORSTATE_TEXT;
/* Skip input up to the closing '>' and then resume plain text. */
775 case MINORSTATE_TAGEND:
777 for(i = 0; i < len; ++i) {
778 if(data[i] == ISO_gt) {
779 s.minorstate = MINORSTATE_TEXT;
/* Entry point for feeding HTML data to the parser.
 *
 * data:    the bytes to parse.
 * datalen: number of bytes in data (16-bit).
 *
 * parse_word() takes an 8-bit length, so one call passes 255 bytes and the
 * other passes datalen cast to uint8_t; plen receives parse_word()'s
 * return value. NOTE(review): the loop and the condition that selects
 * between the two calls are not visible in this excerpt; presumably the
 * cast is used only when datalen is at most 255 - confirm. */
798 htmlparser_parse(
char *data, uint16_t datalen)
804 plen = parse_word(data, 255);
806 plen = parse_word(data, (uint8_t)datalen);