Contiki 3.x
htmlparser.c
1 /*
2  * Copyright (c) 2002, Adam Dunkels.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following
12  * disclaimer in the documentation and/or other materials provided
13  * with the distribution.
14  * 3. The name of the author may not be used to endorse or promote
15  * products derived from this software without specific prior
16  * written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
19  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *
30  * This file is part of the Contiki desktop environment
31  *
32  *
33  */
34 
35 /* htmlparser.c:
36  *
37  * Implements a very simplistic HTML parser. It recognizes HTML links
38  * (<a href>-tags), HTML img alt tags, a few text flow break tags
39  * (<br>, <p>, <h>), the <li> tag (but does not even try to
40  * distinguish between <ol> or <ul>) as well as HTML comment tags
41  * (<!-- -->).
42  *
43  * To save memory, the HTML parser is state machine driven, which
44  * means that it will shave off one character from the HTML page,
45  * process that character, and return to the next. Another way of
46  * doing it would be to buffer a number of characters and process them
47  * together.
48  *
49  * The main function in this file is the htmlparser_parse() function,
50  * which takes a part of an HTML file as its argument; the parser
51  * state is kept in a file-local htmlparser_state structure. The
52  * htmlparser_parse() function calls the helper function parse_word(),
53  * which in turn calls parse_tag(). Those functions call callback
54  * functions such as htmlparser_word(), htmlparser_newline() and
55  * htmlparser_link(), which must be implemented by the using module
56  * (e.g., a web browser program).
57  *
58  * htmlparser_word() is called with each completed word of text, and
59  * htmlparser_link() when the end of a link (</a>) has been found.
60  *
61  */
62 
63 #include <string.h>
64 
65 #include "contiki.h"
66 #include "html-strings.h"
67 #include "www.h"
68 
69 #include "htmlparser.h"
70 
/*
 * Debug tracing. With "#if 1" PRINTF(x) expands to nothing, so all
 * trace statements are compiled out; change the condition to 0 to
 * route them to printf(). Callers pass the whole printf argument list
 * in an extra pair of parentheses: PRINTF(("fmt %d\n", value));
 */
#if 1
#define PRINTF(x)
#else
#include <stdio.h>
#define PRINTF(x) printf x
#endif
77 
78 
79 /*-----------------------------------------------------------------------------------*/
/*
 * Numeric character codes used by the parser. The values coincide
 * with ASCII / ISO 8859-1.
 * NOTE(review): presumably numeric codes are used instead of character
 * literals so that parsing does not depend on the compiler's execution
 * character set - confirm.
 */

/* Upper-case letters. */
#define ISO_A 0x41
#define ISO_B 0x42
#define ISO_E 0x45
#define ISO_F 0x46
#define ISO_G 0x47
#define ISO_H 0x48
#define ISO_I 0x49
#define ISO_L 0x4c
#define ISO_M 0x4d
#define ISO_P 0x50
#define ISO_R 0x52
#define ISO_T 0x54

/* Lower-case letters: the upper-case code with bit 0x20 set. */
#define ISO_a (ISO_A | 0x20)
#define ISO_b (ISO_B | 0x20)
#define ISO_e (ISO_E | 0x20)
#define ISO_f (ISO_F | 0x20)
#define ISO_g (ISO_G | 0x20)
#define ISO_h (ISO_H | 0x20)
#define ISO_i (ISO_I | 0x20)
#define ISO_l (ISO_L | 0x20)
#define ISO_m (ISO_M | 0x20)
#define ISO_p (ISO_P | 0x20)
#define ISO_r (ISO_R | 0x20)
#define ISO_t (ISO_T | 0x20)

/* Whitespace and punctuation. */
#define ISO_ht 0x09        /* horizontal tab */
#define ISO_nl 0x0a        /* line feed */
#define ISO_cr 0x0d        /* carriage return */
#define ISO_space 0x20
#define ISO_bang 0x21      /* ! */
#define ISO_citation 0x22  /* " */
#define ISO_ampersand 0x26 /* & */
#define ISO_citation2 0x27 /* ' */
#define ISO_asterisk 0x2a  /* * */
#define ISO_dash 0x2d      /* - */
#define ISO_slash 0x2f     /* / */
#define ISO_semicolon 0x3b /* ; */
#define ISO_lt 0x3c        /* < */
#define ISO_eq 0x3d        /* = */
#define ISO_gt 0x3e        /* > */
121 
/*
 * Minor states: where in the character stream the parser currently is.
 * parse_word() dispatches on s.minorstate.
 */
#define MINORSTATE_NONE 0
#define MINORSTATE_TEXT 1 /* Parse normal text */
#define MINORSTATE_EXTCHAR 2 /* Check for semi-colon */
#define MINORSTATE_TAG 3 /* Check for name of tag. */
#define MINORSTATE_TAGEND 4 /* Scan for end of tag. */
#define MINORSTATE_TAGATTR 5 /* Parse tag attr. */
#define MINORSTATE_TAGATTRSPACE 6 /* Parse optional space after tag
                                     attr. */
#define MINORSTATE_TAGATTRPARAM 7 /* Parse tag attr parameter. */
#define MINORSTATE_TAGATTRPARAMNQ 8 /* Parse tag attr parameter without
                                       quotation marks. */
#define MINORSTATE_HTMLCOMMENT 9 /* Scan for HTML comment end */

/*
 * Major states: which part of the document is being processed. The
 * numeric order matters: do_word() throws text away whenever
 * s.majorstate >= MAJORSTATE_DISCARD, so DISCARD and SCRIPT must stay
 * the highest values.
 */
#define MAJORSTATE_NONE 0
#define MAJORSTATE_BODY 1
#define MAJORSTATE_LINK 2
#define MAJORSTATE_FORM 3
#define MAJORSTATE_DISCARD 4
#define MAJORSTATE_SCRIPT 5
141 
/*
 * Complete parser state. A single file-local instance (s, below) is
 * used by every function in this file.
 */
struct htmlparser_state {

  unsigned char minorstate;      /* Current MINORSTATE_* value. */
  char tag[20];                  /* Lower-cased name of the tag being
                                    parsed. */
  unsigned char tagptr;          /* Write index into tag[]; reused as a
                                    dash counter while inside an HTML
                                    comment. */
  char tagattr[20];              /* Lower-cased name of the current
                                    attribute. */
  unsigned char tagattrptr;      /* Write index into tagattr[]. */
  char tagattrparam[WWW_CONF_MAX_URLLEN + 1]; /* Value of the current
                                    attribute. */
  unsigned char tagattrparamptr; /* Write index into tagattrparam[]. */
  unsigned char quotechar;       /* Quote character that opened the
                                    current attribute value. */
  unsigned char majorstate, lastmajorstate; /* Current and previous
                                    MAJORSTATE_* value, see
                                    switch_majorstate(). */
  char linkurl[WWW_CONF_MAX_URLLEN + 1]; /* href of the link currently
                                    open, copied in parse_tag(). */

  char word[WWW_CONF_WEBPAGE_WIDTH]; /* Text collected by add_char(). */
  unsigned char wordlen;         /* Number of characters in word[]. */

#if WWW_CONF_FORMS
  char formaction[WWW_CONF_MAX_FORMACTIONLEN + 1]; /* action attribute
                                    of the current <form>. */
  unsigned char inputtype;       /* HTMLPARSER_INPUTTYPE_* of the
                                    current <input>. */
  char inputname[WWW_CONF_MAX_INPUTNAMELEN + 1];   /* name attribute. */
  char inputvalue[WWW_CONF_MAX_INPUTVALUELEN + 1]; /* value (or alt)
                                    attribute. */
  unsigned char inputvaluesize;  /* size attribute, see init_input()
                                    for the default. */
#endif /* WWW_CONF_FORMS */
};

/* The one and only parser instance. */
static struct htmlparser_state s;
168 
169 /*-----------------------------------------------------------------------------------*/
/* Sentinel entry that terminates the tags[] table (single byte 0xff). */
static char last[1] = {(char)0xff};

/*
 * Table of recognised tag names. Each TAG_* constant is the index of
 * the string that follows it, so the #defines and the initialisers
 * must be kept in step. find_tag() narrows a contiguous range of this
 * table one character at a time, which only works while the entries
 * stay sorted in ascending character order.
 */
static const char *tags[] = {
#define TAG_FIRST 0
#define TAG_SLASHA 0
  html_slasha,
#define TAG_SLASHDIV 1
  html_slashdiv,
#define TAG_SLASHFORM 2
  html_slashform,
#define TAG_SLASHH 3
  html_slashh,
#define TAG_SLASHSCRIPT 4
  html_slashscript,
#define TAG_SLASHSELECT 5
  html_slashselect,
#define TAG_SLASHSTYLE 6
  html_slashstyle,
#define TAG_A 7
  html_a,
#define TAG_BODY 8
  html_body,
#define TAG_BR 9
  html_br,
#define TAG_FORM 10
  html_form,
#define TAG_H1 11
  html_h1,
#define TAG_H2 12
  html_h2,
#define TAG_H3 13
  html_h3,
#define TAG_H4 14
  html_h4,
#define TAG_IMG 15
  html_img,
#define TAG_INPUT 16
  html_input,
#define TAG_LI 17
  html_li,
#define TAG_P 18
  html_p,
#define TAG_SCRIPT 19
  html_script,
#define TAG_SELECT 20
  html_select,
#define TAG_STYLE 21
  html_style,
#define TAG_TR 22
  html_tr,
  /* TAG_LAST doubles as the "no such tag" result of find_tag(). */
#define TAG_LAST 23
  last,
};
223 
224 /*-----------------------------------------------------------------------------------*/
225 static unsigned char
226 iswhitespace(char c)
227 {
228  return (c == ISO_space ||
229  c == ISO_nl ||
230  c == ISO_cr ||
231  c == ISO_ht);
232 }
233 /*-----------------------------------------------------------------------------------*/
#if WWW_CONF_FORMS
/*
 * Reset the state collected for one <input> element: no type, empty
 * name and value, and the default field size. The last byte of each
 * form string buffer is also forced to 0 so that the strncpy() calls
 * in the tag handler always leave a terminated string.
 */
static void
init_input(void)
{
  s.inputtype = HTMLPARSER_INPUTTYPE_NONE;
  s.inputvaluesize = 20; /* De facto default size */

  /* Empty name and value. */
  s.inputname[0] = 0;
  s.inputvalue[0] = 0;

  /* Hard terminators at the very end of each buffer. */
  s.formaction[WWW_CONF_MAX_FORMACTIONLEN] = 0;
  s.inputname[WWW_CONF_MAX_INPUTNAMELEN] = 0;
  s.inputvalue[WWW_CONF_MAX_INPUTVALUELEN] = 0;
}
#endif /* WWW_CONF_FORMS */
246 /*-----------------------------------------------------------------------------------*/
247 void
248 htmlparser_init(void)
249 {
250  s.majorstate = s.lastmajorstate = MAJORSTATE_DISCARD;
251  s.minorstate = MINORSTATE_TEXT;
252  s.wordlen = 0;
253 #if WWW_CONF_FORMS
254  s.formaction[0] = 0;
255 #endif /* WWW_CONF_FORMS */
256 }
257 /*-----------------------------------------------------------------------------------*/
/*
 * Convert an upper-case letter (character codes 0x41..0x5a, 'A'..'Z')
 * to lower case by setting bit 0x20. Every other character is returned
 * unchanged.
 *
 * The previous implementation masked every character above 0x40, which
 * also rewrote non-letters (for example '_' became 0x7f and '[' became
 * '{'). Numeric codes are used instead of character literals, in line
 * with the ISO_* constants used throughout this file.
 */
static char
lowercase(char c)
{
  if(c >= 0x41 && c <= 0x5a) {
    return (char)(c | 0x20);
  }
  return c;
}
270 /*-----------------------------------------------------------------------------------*/
271 static void
272 endtagfound(void)
273 {
274  s.tag[s.tagptr] = 0;
275  s.tagattr[s.tagattrptr] = 0;
276  s.tagattrparam[s.tagattrparamptr] = 0;
277 }
278 /*-----------------------------------------------------------------------------------*/
279 static void
280 switch_majorstate(unsigned char newstate)
281 {
282  if(s.majorstate != newstate) {
283  PRINTF(("Switching state from %d to %d (%d)\n", s.majorstate, newstate, s.lastmajorstate));
284  s.lastmajorstate = s.majorstate;
285  s.majorstate = newstate;
286  }
287 }
288 /*-----------------------------------------------------------------------------------*/
289 static void
290 add_char(unsigned char c)
291 {
292  if(s.wordlen < WWW_CONF_WEBPAGE_WIDTH - 1 && c < 0x80) {
293  s.word[s.wordlen] = c;
294  ++s.wordlen;
295  }
296 }
297 /*-----------------------------------------------------------------------------------*/
298 static void
299 do_word(void)
300 {
301  if(s.wordlen > 0) {
302  if(s.majorstate == MAJORSTATE_LINK) {
303  if(s.word[s.wordlen - 1] != ISO_space) {
304  add_char(ISO_space);
305  }
306  } else if(s.majorstate >= MAJORSTATE_DISCARD) {
307  s.wordlen = 0;
308  } else {
309  s.word[s.wordlen] = '\0';
310  htmlparser_word(s.word, s.wordlen);
311  s.wordlen = 0;
312  }
313  }
314 }
315 /*-----------------------------------------------------------------------------------*/
/*
 * Emit a line break: first let do_word() deal with any pending word
 * so that it ends up before the break, then notify the using module
 * through htmlparser_newline().
 */
static void
newline(void)
{
  do_word();
  htmlparser_newline();
}
322 /*-----------------------------------------------------------------------------------*/
323 static unsigned char
324 find_tag(char *tag)
325 {
326  static unsigned char first, last, i, tabi;
327  static char tagc;
328 
329  first = TAG_FIRST;
330  last = TAG_LAST;
331  i = 0;
332 
333  do {
334  tagc = tag[i];
335 
336  if((tagc == 0 || tagc == ISO_slash) && tags[first][i] == 0) {
337  return first;
338  }
339 
340  tabi = first;
341 
342  /* First, find first matching tag from table. */
343  while(tagc > (tags[tabi])[i] && tabi < last) {
344  ++tabi;
345  }
346  first = tabi;
347 
348  /* Second, find last matching tag from table. */
349  while(tagc == (tags[tabi])[i] && tabi < last) {
350  ++tabi;
351  }
352  last = tabi;
353 
354  /* If first and last matching tags are equal, we have a non-match
355  and return. Else we continue with the next character. */
356  ++i;
357 
358  } while(last != first);
359  return TAG_LAST;
360 }
361 /*-----------------------------------------------------------------------------------*/
362 static void
363 parse_tag(void)
364 {
365  static char *tagattrparam;
366  static unsigned char tag;
367  static unsigned char size;
368 
369  tag = find_tag(s.tag);
370  /* If we are inside a <script> we mustn't interpret any tags
371  (inside JavaScript strings) but wait for the </script>. */
372  if(s.majorstate == MAJORSTATE_SCRIPT && tag != TAG_SLASHSCRIPT) {
373  return;
374  }
375 
376  PRINTF(("Parsing tag '%s' '%s' '%s'\n", s.tag, s.tagattr, s.tagattrparam));
377 
378  switch(tag) {
379  case TAG_P:
380  case TAG_H1:
381  case TAG_H2:
382  case TAG_H3:
383  case TAG_H4:
384  newline();
385  /* FALLTHROUGH */
386  case TAG_BR:
387  case TAG_TR:
388  case TAG_SLASHDIV:
389  case TAG_SLASHH:
390  newline();
391  break;
392  case TAG_LI:
393  if(s.tagattr[0] == 0) {
394  newline();
395  add_char(ISO_asterisk);
396  add_char(ISO_space);
397  }
398  break;
399  case TAG_SCRIPT:
400  switch_majorstate(MAJORSTATE_SCRIPT);
401  break;
402  case TAG_STYLE:
403  case TAG_SELECT:
404  switch_majorstate(MAJORSTATE_DISCARD);
405  break;
406  case TAG_SLASHSCRIPT:
407  case TAG_SLASHSTYLE:
408  case TAG_SLASHSELECT:
409  do_word();
410  switch_majorstate(s.lastmajorstate);
411  break;
412  case TAG_BODY:
413  s.majorstate = s.lastmajorstate = MAJORSTATE_BODY;
414  break;
415  case TAG_IMG:
416  if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 && s.tagattrparam[0] != 0) {
417  add_char(ISO_lt);
418  tagattrparam = &s.tagattrparam[0];
419  while(*tagattrparam) {
420  add_char(*tagattrparam);
421  ++tagattrparam;
422  }
423  add_char(ISO_gt);
424  do_word();
425  }
426  break;
427  case TAG_A:
428  PRINTF(("A %s %s\n", s.tagattr, s.tagattrparam));
429  if(strncmp(s.tagattr, html_href, sizeof(html_href)) == 0 && s.tagattrparam[0] != 0) {
430  strcpy(s.linkurl, s.tagattrparam);
431  do_word();
432  switch_majorstate(MAJORSTATE_LINK);
433  }
434  break;
435  case TAG_SLASHA:
436  if(s.majorstate == MAJORSTATE_LINK) {
437  switch_majorstate(s.lastmajorstate);
438  s.word[s.wordlen] = 0;
439  htmlparser_link(s.word, s.wordlen, s.linkurl);
440  s.wordlen = 0;
441  }
442  break;
443 #if WWW_CONF_FORMS
444  case TAG_FORM:
445  /* First check if we are called at the end of a form tag. If
446  so, we should propagate the form action. */
447  if(s.tagattr[0] == 0 && s.formaction[0] != 0) {
448  htmlparser_form(s.formaction);
449  init_input();
450  } else {
451  PRINTF(("Form tag\n"));
452  switch_majorstate(MAJORSTATE_FORM);
453  if(strncmp(s.tagattr, html_action, sizeof(html_action)) == 0) {
454  PRINTF(("Form action '%s'\n", s.tagattrparam));
455  strncpy(s.formaction, s.tagattrparam, WWW_CONF_MAX_FORMACTIONLEN - 1);
456  }
457  }
458  break;
459  case TAG_SLASHFORM:
460  switch_majorstate(MAJORSTATE_BODY);
461  s.formaction[0] = 0;
462  break;
463  case TAG_INPUT:
464  if(s.majorstate == MAJORSTATE_FORM) {
465  /* First check if we are called at the end of an input tag. If
466  so, we should render the input widget. */
467  if(s.tagattr[0] == 0 && s.inputname[0] != 0) {
468  PRINTF(("Render input type %d\n", s.inputtype));
469  switch(s.inputtype) {
470  case HTMLPARSER_INPUTTYPE_NONE:
471  case HTMLPARSER_INPUTTYPE_TEXT:
472  case HTMLPARSER_INPUTTYPE_HIDDEN:
473  htmlparser_inputfield(s.inputtype, s.inputvaluesize, s.inputvalue, s.inputname);
474  break;
475  case HTMLPARSER_INPUTTYPE_SUBMIT:
476  case HTMLPARSER_INPUTTYPE_IMAGE:
477  htmlparser_submitbutton(s.inputvalue, s.inputname);
478  break;
479  }
480  init_input();
481  } else {
482  PRINTF(("Input '%s' '%s'\n", s.tagattr, s.tagattrparam));
483  if(strncmp(s.tagattr, html_type, sizeof(html_type)) == 0) {
484  if(strncmp(s.tagattrparam, html_submit, sizeof(html_submit)) == 0) {
485  s.inputtype = HTMLPARSER_INPUTTYPE_SUBMIT;
486  } else if(strncmp(s.tagattrparam, html_image, sizeof(html_image)) == 0) {
487  s.inputtype = HTMLPARSER_INPUTTYPE_IMAGE;
488  } else if(strncmp(s.tagattrparam, html_text, sizeof(html_text)) == 0) {
489  s.inputtype = HTMLPARSER_INPUTTYPE_TEXT;
490  } else if(strncmp(s.tagattrparam, html_hidden, sizeof(html_hidden)) == 0) {
491  s.inputtype = HTMLPARSER_INPUTTYPE_HIDDEN;
492  } else {
493  s.inputtype = HTMLPARSER_INPUTTYPE_OTHER;
494  }
495  } else if(strncmp(s.tagattr, html_name, sizeof(html_name)) == 0) {
496  strncpy(s.inputname, s.tagattrparam, WWW_CONF_MAX_INPUTNAMELEN);
497  } else if(strncmp(s.tagattr, html_alt, sizeof(html_alt)) == 0 &&
498  s.inputtype == HTMLPARSER_INPUTTYPE_IMAGE) {
499  strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
500  } else if(strncmp(s.tagattr, html_value, sizeof(html_value)) == 0) {
501  strncpy(s.inputvalue, s.tagattrparam, WWW_CONF_MAX_INPUTVALUELEN);
502  } else if(strncmp(s.tagattr, html_size, sizeof(html_size)) == 0) {
503  size = 0;
504  if(s.tagattrparam[0] >= '0' &&
505  s.tagattrparam[0] <= '9') {
506  size = s.tagattrparam[0] - '0';
507  if(s.tagattrparam[1] >= '0' &&
508  s.tagattrparam[1] <= '9') {
509  size = size * 10 + (s.tagattrparam[1] - '0');
510  }
511  }
512  if(size >= WWW_CONF_MAX_INPUTVALUELEN) {
513  size = WWW_CONF_MAX_INPUTVALUELEN - 1;
514  }
515  s.inputvaluesize = size;
516  }
517  }
518  }
519  break;
520 #endif /* WWW_CONF_FORMS */
521  }
522 }
523 /*-----------------------------------------------------------------------------------*/
524 static uint16_t
525 parse_word(char *data, uint8_t dlen)
526 {
527  static uint8_t i;
528  static uint8_t len;
529  unsigned char c;
530 
531  len = dlen;
532 
533  switch(s.minorstate) {
534  case MINORSTATE_TEXT:
535  for(i = 0; i < len; ++i) {
536  c = data[i];
537  if(iswhitespace(c)) {
538  do_word();
539  } else if(c == ISO_lt) {
540  s.minorstate = MINORSTATE_TAG;
541  s.tagptr = 0;
542  break;
543  } else if(c == ISO_ampersand) {
544  s.minorstate = MINORSTATE_EXTCHAR;
545  break;
546  } else {
547  add_char(c);
548  }
549  }
550  break;
551  case MINORSTATE_EXTCHAR:
552  for(i = 0; i < len; ++i) {
553  c = data[i];
554  if(c == ISO_semicolon) {
555  s.minorstate = MINORSTATE_TEXT;
556  add_char(' ');
557  break;
558  } else if(iswhitespace(c)) {
559  s.minorstate = MINORSTATE_TEXT;
560  add_char('&');
561  add_char(' ');
562  break;
563  }
564  }
565  break;
566  case MINORSTATE_TAG:
567  /* If we are inside a <srcipt> we mustn't mistake a JavaScript
568  equation with a '<' as a tag. So we check for the very next
569  character to be a '/' as we're only interested in parsing
570  the </script>. */
571  if(s.majorstate == MAJORSTATE_SCRIPT && data[0] != ISO_slash) {
572  s.minorstate = MINORSTATE_TEXT;
573  break;
574  }
575 
576  /* We are currently parsing within the name of a tag. We check
577  for the end of a tag (the '>' character) or whitespace (which
578  indicates that we should parse a tag attr argument
579  instead). */
580  for(i = 0; i < len; ++i) {
581  c = data[i];
582  if(c == ISO_gt) {
583  /* Full tag found. We continue parsing regular text. */
584  s.minorstate = MINORSTATE_TEXT;
585  s.tagattrptr = s.tagattrparamptr = 0;
586  endtagfound();
587  parse_tag();
588  break;
589  } else if(iswhitespace(c)) {
590  /* The name of the tag found. We continue parsing the tag
591  attr.*/
592  s.minorstate = MINORSTATE_TAGATTR;
593  s.tagattrptr = 0;
594  endtagfound();
595  break;
596  } else {
597  /* Keep track of the name of the tag, but convert it to
598  lower case. */
599  s.tag[s.tagptr] = lowercase(c);
600  ++s.tagptr;
601  /* Check if the ->tag field is full. If so, we just eat up
602  any data left in the tag. */
603  if(s.tagptr == sizeof(s.tag)) {
604  s.minorstate = MINORSTATE_TAGEND;
605  break;
606  }
607  }
608 
609  /* Check for HTML comment, indicated by <!-- */
610  if(s.tagptr == 3 &&
611  s.tag[0] == ISO_bang &&
612  s.tag[1] == ISO_dash &&
613  s.tag[2] == ISO_dash) {
614  PRINTF(("Starting comment...\n"));
615  s.minorstate = MINORSTATE_HTMLCOMMENT;
616  s.tagptr = 0;
617  endtagfound();
618  break;
619  }
620  }
621  break;
622  case MINORSTATE_TAGATTR:
623  /* We parse the "tag attr", i.e., the "href" in <a
624  href="...">. */
625  for(i = 0; i < len; ++i) {
626  c = data[i];
627  if(c == ISO_gt) {
628  /* Full tag found. */
629  s.minorstate = MINORSTATE_TEXT;
630  s.tagattrparamptr = 0;
631  s.tagattrptr = 0;
632  endtagfound();
633  parse_tag();
634  s.tagptr = 0;
635  endtagfound();
636  break;
637  } else if(iswhitespace(c)) {
638  if(s.tagattrptr == 0) {
639  /* Discard leading spaces. */
640  } else {
641  /* A non-leading space is the end of the attribute. */
642  s.tagattrparamptr = 0;
643  endtagfound();
644  parse_tag();
645  s.minorstate = MINORSTATE_TAGATTRSPACE;
646  break;
647  }
648  } else if(c == ISO_eq) {
649  s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
650  s.tagattrparamptr = 0;
651  endtagfound();
652  break;
653  } else {
654  s.tagattr[s.tagattrptr] = lowercase(c);
655  ++s.tagattrptr;
656  /* Check if the "tagattr" field is full. If so, we just eat
657  up any data left in the tag. */
658  if(s.tagattrptr == sizeof(s.tagattr)) {
659  s.minorstate = MINORSTATE_TAGEND;
660  break;
661  }
662  }
663  }
664  break;
665  case MINORSTATE_TAGATTRSPACE:
666  for(i = 0; i < len; ++i) {
667  c = data[i];
668  if(iswhitespace(c)) {
669  /* Discard spaces. */
670  } else if(c == ISO_eq) {
671  s.minorstate = MINORSTATE_TAGATTRPARAMNQ;
672  s.tagattrparamptr = 0;
673  endtagfound();
674  parse_tag();
675  break;
676  } else {
677  s.tagattr[0] = lowercase(c);
678  s.tagattrptr = 1;
679  s.minorstate = MINORSTATE_TAGATTR;
680  break;
681  }
682  }
683  break;
684  case MINORSTATE_TAGATTRPARAMNQ:
685  /* We are parsing the "tag attr parameter", i.e., the link part
686  in <a href="link">. */
687  for(i = 0; i < len; ++i) {
688  c = data[i];
689  if(c == ISO_gt) {
690  /* Full tag found. */
691  endtagfound();
692  parse_tag();
693  s.minorstate = MINORSTATE_TEXT;
694  s.tagattrptr = 0;
695  endtagfound();
696  parse_tag();
697  s.tagptr = 0;
698  endtagfound();
699  break;
700  } else if(iswhitespace(c) && s.tagattrparamptr == 0) {
701  /* Discard leading spaces. */
702  } else if((c == ISO_citation ||
703  c == ISO_citation2) && s.tagattrparamptr == 0) {
704  s.minorstate = MINORSTATE_TAGATTRPARAM;
705  s.quotechar = c;
706  PRINTF(("tag attr param q found\n"));
707  break;
708  } else if(iswhitespace(c)) {
709  PRINTF(("Non-leading space found at %d\n", s.tagattrparamptr));
710  /* Stop parsing if a non-leading space was found */
711  endtagfound();
712  parse_tag();
713 
714  s.minorstate = MINORSTATE_TAGATTR;
715  s.tagattrptr = 0;
716  endtagfound();
717  break;
718  } else {
719  s.tagattrparam[s.tagattrparamptr] = c;
720  ++s.tagattrparamptr;
721  /* Check if the "tagattr" field is full. If so, we just eat
722  up any data left in the tag. */
723  if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
724  s.minorstate = MINORSTATE_TAGEND;
725  break;
726  }
727  }
728  }
729  break;
730  case MINORSTATE_TAGATTRPARAM:
731  /* We are parsing the "tag attr parameter", i.e., the link
732  part in <a href="link">. */
733  for(i = 0; i < len; ++i) {
734  c = data[i];
735  if(c == s.quotechar) {
736  /* Found end of tag attr parameter. */
737  endtagfound();
738  parse_tag();
739 
740  s.minorstate = MINORSTATE_TAGATTR;
741  s.tagattrptr = 0;
742  endtagfound();
743  break;
744  } else {
745  if(iswhitespace(c)) {
746  s.tagattrparam[s.tagattrparamptr] = ISO_space;
747  } else {
748  s.tagattrparam[s.tagattrparamptr] = c;
749  }
750 
751  ++s.tagattrparamptr;
752  /* Check if the "tagattr" field is full. If so, we just eat
753  up any data left in the tag. */
754  if(s.tagattrparamptr >= sizeof(s.tagattrparam) - 1) {
755  s.minorstate = MINORSTATE_TAGEND;
756  break;
757  }
758  }
759  }
760  break;
761  case MINORSTATE_HTMLCOMMENT:
762  for(i = 0; i < len; ++i) {
763  c = data[i];
764  if(c == ISO_dash) {
765  ++s.tagptr;
766  } else if(c == ISO_gt && s.tagptr > 0) {
767  PRINTF(("Comment done.\n"));
768  s.minorstate = MINORSTATE_TEXT;
769  break;
770  } else {
771  s.tagptr = 0;
772  }
773  }
774  break;
775  case MINORSTATE_TAGEND:
776  /* Discard characters until a '>' is seen. */
777  for(i = 0; i < len; ++i) {
778  if(data[i] == ISO_gt) {
779  s.minorstate = MINORSTATE_TEXT;
780  s.tagattrptr = 0;
781  endtagfound();
782  parse_tag();
783  break;
784  }
785  }
786  break;
787  default:
788  i = 0;
789  break;
790  }
791  if(i >= len) {
792  return len;
793  }
794  return i + 1;
795 }
796 /*-----------------------------------------------------------------------------------*/
/*
 * Parse datalen bytes of HTML starting at data.
 *
 * parse_word() takes an 8-bit length, so the input is fed to it in
 * pieces of at most 255 bytes; after each call the input is advanced
 * by the number of bytes parse_word() reports as consumed, until
 * nothing is left.
 */
void
htmlparser_parse(char *data, uint16_t datalen)
{
  uint16_t consumed;
  uint8_t chunk;

  while(datalen > 0) {
    chunk = datalen > 255 ? 255 : (uint8_t)datalen;
    consumed = parse_word(data, chunk);
    data += consumed;
    datalen -= consumed;
  }
}
812 /*-----------------------------------------------------------------------------------*/