2 * CDE - Common Desktop Environment
4 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
6 * These libraries and programs are free software; you can
7 * redistribute them and/or modify them under the terms of the GNU
8 * Lesser General Public License as published by the Free Software
9 * Foundation; either version 2 of the License, or (at your option)
12 * These libraries and programs are distributed in the hope that
13 * they will be useful, but WITHOUT ANY WARRANTY; without even the
14 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 * PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with these libraries and programs; if not, write
20 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
21 * Floor, Boston, MA 02110-1301 USA
23 /* $XConsortium: pars2.c /main/3 1996/06/19 17:16:36 drk $ */
24 #include "sgmlincl.h" /* #INCLUDE statements for SGML parser. */
25 /* PARSE: Parse a source input stream with specified lexical and state tables.
26 Return to caller with action code.
29 struct parse *pcb; /* Current parse control block. */
31 int rc; /* Return code from ENTREF. */
35 pcb->input = pcb->plex[*FPOS];
36 pcb->state = pcb->newstate;
37 pcb->newstate = (*(pcb->ptab + pcb->state)) [pcb->input];
38 pcb->action = (*(pcb->ptab + pcb->state + 1)) [pcb->input];
40 switch (pcb->action) {
41 case RC2_: /* Back up two characters. */
43 case RCC_: /* Repeat current character. */
45 case NOP_: /* No action necessary.*/
48 case RS_: /* Record start: ccnt=0; ++rcnt.*/
52 case GET_: /* EOB or dull EOS or EE found: keep going.*/
53 if (entget()==-1) {pcb->action = EOD_; break;}/* Signal if EOD.*/
56 case EOF_: /* Illegal entity end; return EE_. */
59 case EE_: /* Important EOS or EE found: return to caller.*/
60 if (entget()==-1) pcb->action = EOD_; /* Signal if EOD. */
63 case PER_: /* Parameter entity reference. */
64 REPEATCC; /* Use PERO as 1st char of entity name. */
65 parsenm(entbuf, ENTCASE);
66 parse(&pcbref); /* Handle REFC or other terminator. */
68 if (rc==ENTPI) {pcb->action = PIE_; break;}
71 case ER_: /* General entity reference; continue. */
72 parsenm(entbuf, ENTCASE);
73 parse(&pcbref); /* Handle REFC or other terminator. */
75 if (rc==ENTDATA) {pcb->action = DEF_; break;}
76 if (rc==ENTPI) {pcb->action = PIE_; break;}
80 case PEX_: /* Parameter entity reference; return. */
81 REPEATCC; /* Use PERO as 1st char of entity name. */
82 case ERX_: /* General entity reference; return. */
83 parsenm(entbuf, ENTCASE);
84 parse(&pcbref); /* Handle REFC or other terminator. */
87 /* Reference to external data/subdoc entity in replaceable
89 if (BITON(entdatsw, NDECONT)) {
90 switch (((PNE)data)->nextype) {
93 /* The standard says `non-SGML data entity'
94 but the amendment should have changed it
95 to `external data entity'. */
100 /* This is definitely illegal. */
109 else if (rc == ENTPI) {
110 /* Reference to PI entity not allowed in replaceable
116 else if (rc) pcb->action = EE_;
119 case CRN_: /* Character reference: numeric. */
120 parsetkn(entbuf, NU, NAMELEN);
121 parse(&pcbref); /* Handle reference terminator. */
122 pcb->action = charrefn(entbuf, pcb);
123 if (pcb->action==CRN_) continue; /* Invalid reference */
126 case CRA_: /* Character reference: alphabetic. */
127 parsenm(entbuf, NAMECASE);
128 parse(&pcbref); /* Handle reference terminator. */
132 case SYS_: /* Invalid NONCHAR: send msg and ignore. */
134 if (*FPOS == DELNONCH) NEWCC;
137 case NON_: /* Valid NONCHAR: prefix and shift encoding. */
139 pcb->action = datachar(*FPOS, pcb);
147 case PCI_: /* Previous character was invalid (INV_). */
149 case INV_: /* Markup ended by invalid char; repeat char. */
154 case LNR_: /* Previous char exceeded len; back up to it. */
156 case LEN_: /* Token too long; ignore excess character. */
160 case RCR_: /* Repeat current char and return to caller. */
162 default: /* Actions for specific parse. */
165 return (int)pcb->action;
168 /* CHARREFA: Resolve an alphabetical reference to a function character
169 and put the character in the read buffer.
170 If reference is bad, issue an error message.
173 UNCH *r; /* Undelimited char ref (with length and EOS). */
177 thechar = mapsrch(funtab, r+1);
181 /* This isn't ideal, because the character position will still
182 be wrong for one line. */
183 if (thechar == RSCHAR) RCNT--;
189 /* Make the current character ch. */
194 /* If we're reading directly from an internal entity, we can't
195 change the entity, since the entity might be referenced again.
196 So in this case we copy the entity. This is inefficient, but
197 it will only happen in a case like this:
200 <!entity e "x%amp;#SPACE;">
202 Usually character references will have been processed while the
203 entity was being defined. */
205 if (!FILESW && !COPIEDSW) {
206 UNCH *s = savestr(FBUF + 1);
207 FPOS = s + (FPOS - FBUF - 1);
215 /* CHARREFN: Resolve a numeric character reference.
216 If reference is bad, issue an error message.
220 UNCH *r; /* Undelimited character reference. */
221 struct parse *pcb; /* Current parse control block. */
225 thechar = atoi((char *)r);
226 if (thechar<0 || thechar>255) {
228 return((int)pcb->action);
230 return datachar(thechar, pcb);
233 /* Return ch as a datachar. If this is a non-SGML character which might
234 confuse the parser, shift it to a code that won't and place it in a
235 special buffer which has DELNONCH in the preceding byte. Otherwise
236 put it in the read buffer. */
238 int datachar(ch, pcb)
250 /* A potentially confusing character which must be prefixed
252 nonchbuf[1] = SHIFTNON((UNCH)ch);
256 /* If in content, return DCE_ for element content, DAF_ for mixed. */
257 /* If not content, it must be a literal parse, so return MLA_. */
263 /* Action for DAF_ will do REPEATCC. */
271 /* INITATT: Initialize al with adl. */
276 notadn = 0; /* No NOTATION attribute yet. */
277 conrefsw = 0; /* Assume no content reference att. */
278 /* Copy attribute definition list as a template. */
279 memcpy((UNIV)al, (UNIV)adl, (1+ADN(adl))*ADSZ);
282 /* PARSEATT: Parse attribute specification list.
283 Make a current copy of the attribute definition list
284 and update it with the user's specifications.
285 Indicate each attribute that was specified in the
286 list (as opposed to defaulted) by setting the ASPEC flag.
287 If no attributes were specified, return NULL. Otherwise,
288 if in the prolog, make a permanent copy of the list and
289 return its pointer. If not in the prolog, return al.
291 struct ad *parseatt(adl, pt)
292 struct ad *adl; /* Attribute definition list. */
293 UNCH *pt; /* Tokenization area: tbuf[TAGLEN+ATTSPLEN]. */
296 UNCH *nm = 0; /* Pointer to saved name in tbuf (with length). */
297 int adn = -1; /* Position of attribute in list (-1=empty). */
298 UNCH *tbuflim = pt + ATTSPLEN;
299 mdessv = es; /* Save es for checking entity nesting. */
301 while (pt<=tbuflim) {
303 switch (pcbstag.action) {
304 case NVS: /* Att name or value token found. */
305 parsenm(pt, NAMECASE); /* Case translation wanted on name. */
306 pt += *(nm = pt); /* Save name while pointing past it. */
309 case AVD: /* Delimited value found. */
310 case AVDA: /* Delimited value found (alternate delimiter). */
311 /* Find position (adn) of saved attribute name in list. */
312 adn = anmget((int)ADN(al), nm);
314 (adn == 0 || ADTYPE(al, adn) == ACHARS)
318 (pcbstag.action==AVD) ? lex.d.lit : lex.d.lita);
320 /* Error: unrecognized attribute name. */
321 sgmlerr(13, &pcbstag, nm+1, pt);
324 /* Tokenize and validate value; let it default if an error. */
325 /* Put value in list and bump ptr by the normalized length
326 (which is always >= the actual length). */
327 if (!attval(1, pt, adn, adl)) pt += ADLEN(al,adn);
329 case AVU: /* Attribute value found: undelimited. */
330 if (!sd.shorttag) sgmlerr(196, &pcbstag, (UNCH *)0, (UNCH *)0);
331 parsetkn(pt, NMC, LITLEN);
332 /* Find position (adn) of saved attribute name in list. */
333 if ((adn = anmget((int)ADN(al), nm))==0) {
334 /* Error: unrecognized attribute name. */
335 sgmlerr(13, &pcbstag, nm+1, pt);
338 /* Tokenize and validate value; let it default if an error. */
339 /* Put value in list and bump ptr by the normalized length
340 (which is always >= the actual length). */
341 if (!attval(1, pt, adn, adl)) pt += ADLEN(al,adn);
344 case NASV: /* Saved NVS was really an NTV. */
345 REPEATCC; /* Put back next token starter. */
346 pt = nm; /* Back up to NVS. */
347 case NTV: /* Name token value found. */
348 if (!sd.shorttag) sgmlerr(195, &pcbstag, (UNCH *)0, (UNCH *)0);
349 if (pcbstag.action==NTV) parsenm(pt, NAMECASE);
350 if ((adn = antvget((int)ADN(al), pt, &antvptr))==0) {
351 /* Error: unrecognized name token value. */
352 sgmlerr(74, &pcbstag, pt+1, (UNCH *)0);
355 /* Validate value; let it default if an error. */
356 /* Put value in list and bump ptr by the normalized length
357 (which is always >= the actual length). */
358 if (!attval(0, antvptr+1, adn, adl)) pt += ADLEN(al,adn);
361 default: /* All attributes have been parsed. */
362 REPEATCC; /* Put next char back for tag close parse. */
367 if (pt>tbuflim) synerr(75, &pcbstag);
368 if (es!=mdessv) synerr(37, &pcbstag);
369 if (adn<0) return((struct ad *)0); /* List was empty. */
373 /* ATTVAL: Validate a specified attribute value. Issue a message if it is
374 the wrong type (or otherwise is not up to spec), and use the default.
375 Call PARSEVAL to tokenize the value, unless it is a CDATA string.
376 If the attribute is a group, the value is a string.
377 For other types, the token count is set by PARSEVAL if the value
378 is syntactically correct. If incorrect (or if CDATA) the token
379 count is zero (i.e., the value is a string).
380 The length of a token does not include the length byte, and
381 there is no EOS. A string length (as always) includes both
382 the length byte and the EOS.
383 If it is a CONREF attribute, set a switch for STAG().
384 If it is a CURRENT attribute, store the value as the new default.
386 #define DEFVAL adl[adn].addef /* Default value of current attribute. */
387 #define DEFNUM adl[adn].adnum /* Default group size of current attribute. */
388 #define DEFLEN adl[adn].adlen /* Length of default value of current attribute.*/
389 int attval(mtvsw, adval, adn, adl)
390 int mtvsw; /* Must tokenize value: 1=yes; 0=no. */
391 UNCH *adval; /* Untokenized attribute value. */
392 int adn; /* Attribute's position in list. */
393 struct ad *adl; /* Element's master att def list. */
395 int errcode; /* Value/declaration conflict error code. */
397 if (GET(ADFLAGS(al,adn), ASPEC)) /* Can't respecify same attribute. */
398 {sgmlerr(73, &pcbstag, ADNAME(al,adn), adval); return(1);}
399 SET(ADFLAGS(al,adn), ASPEC); /* Indicate att was specified. */
400 if (GET(ADFLAGS(al,adn), ACONREF)) /* If attribute is content reference: */
401 conrefsw = TAGREF; /* Set switch for STAG(). */
402 if (mtvsw && ADTYPE(al,adn)!=ACHARS) {
403 /* If no syntax errors, check for proper group membership. */
404 if ( ((errcode = parseval(adval, ADTYPE(al,adn), lbuf))==0)
405 && GET(ADFLAGS(al,adn), AGROUP)
406 && !amemget(&al[adn], ADNUM(al,adn), lbuf) ) errcode = 18;
407 /* If syntax or group membership error, send message and exit. */
409 sgmlerr(errcode, &pcbstag, ADNAME(al,adn), adval);
410 SET(ADFLAGS(al,adn), AERROR);
413 /* Replace specified value in adval with tokenized in lbuf. */
414 ustrcpy(adval, lbuf);
415 if (BITOFF(ADFLAGS(al,adn), AGROUP)) ADNUM(al,adn) = (UNCH)tokencnt;
419 /* If attribute is FIXED, specified value must equal default. */
420 if (BITON(ADFLAGS(al,adn), AFIXED) && ustrcmp(adval, DEFVAL)) {
421 /* Since the value has been tokenized, don't use it in the
423 sgmlerr(67, &pcbstag, ADNAME(al,adn), (UNCH *)0);
424 SET(ADFLAGS(al,adn), AERROR);
427 ADLEN(al,adn) = vallen(ADTYPE(al,adn), ADNUM(al,adn), adval);
428 if (ADLEN(al,adn) > LITLEN) {
429 sgmlerr(224, &pcbstag, ADNAME(al,adn), (UNCH *)0);
430 SET(ADFLAGS(al,adn), AERROR);
433 ADVAL(al,adn) = adval;
434 /* If attribute is CURRENT, value is new default.*/
435 if (GET(ADFLAGS(al,adn), ACURRENT)) {
436 if (ADLEN(al,adn)>DEFLEN) {
437 ds.attdef += (ADLEN(al,adn) - DEFLEN);
438 DEFLEN = ADLEN(al,adn);
440 DEFVAL = replace(DEFVAL, ADVAL(al,adn));
441 DEFNUM = ADNUM(al,adn);
443 return(0); /* Indicate value was valid. */
445 /* ADLVAL: Validate the completed attribute definition list (defaults plus
446 specified values). Issue a message if an
447 attribute is required or current and its value is NULL.
449 VOID adlval(adsz, newetd)
450 int adsz; /* Size of list. */
451 struct etd *newetd; /* Element type definition for this element. */
453 int adn = 1; /* Position in list. */
454 UNCH *npt, *pt; /* Ptr save areas. */
455 UNCH nptsv; /* Save area for ptr value (length?). */
456 struct dcncb *dpt; /* Save area for dcncb ptr. */
458 aentctr = 0; /* Number of AENTITY tokens in this att list. */
459 idrctr = 0; /* Number of IDREF tokens in this att list. */
461 if (ADVAL(al,adn)==NULL) { /* NULL value */
462 if (GET(ADFLAGS(al,adn), AREQ+ACURRENT)) { /*Error if REQ, CURRENT*/
463 sgmlerr(19, &pcbstag, ADNAME(al,adn), (UNCH *)0);
464 SET(ADFLAGS(al,adn), AINVALID);
467 else switch (ADTYPE(al,adn)) {
468 case AENTITY: /* Return data ecb pointer if valid entity. */
469 aenttst(adn, ADVAL(al,adn));
471 case AENTITYS: /* Return data ecb pointers if valid entities. */
473 tokencnt = (int)ADNUM(al,adn);
475 nptsv = *(npt = pt + *pt+1);
476 *pt += 2; *npt = EOS;
478 *pt -= 2; *(pt = npt) = nptsv;
482 /* Define ID; msg if it already exists. */
483 if (iddef(ADVAL(al,adn))) {
484 sgmlerr(71, &pcbstag, ADNAME(al,adn), ADVAL(al,adn)+1);
485 SET(ADFLAGS(al,adn), AINVALID);
491 idreftst(adn, ADVAL(al,adn));
495 tokencnt = (int)ADNUM(al,adn);
497 nptsv = *(npt = pt + *pt+1);
498 *pt += 2; *npt = EOS;
500 *pt -= 2; *(pt = npt) = nptsv;
503 case ANOTEGRP: /* Return notation identifier. */
504 if (GET(ADFLAGS(al,adn), ASPEC)) notadn = adn;/*NOTATION specified*/
505 if ((dpt = dcnfind(ADVAL(al,adn)))==0) {
506 sgmlerr(77, &pcbstag, ADNAME(al,adn), ADVAL(al,adn)+1);
507 SET(ADFLAGS(al,adn), AINVALID);
509 else ADDATA(al,adn).x = dpt;
512 if (!sd.shorttag && !sd.omittag && ADVAL(al,adn)!=NULL
513 && !GET(ADFLAGS(al,adn), ASPEC+AINVALID))
514 sgmlerr(197, &pcbstag, ADNAME(al,adn), (UNCH *)0);
515 } while ((adn+=BITON(ADFLAGS(al,adn),AGROUP) ? (int)ADNUM(al,adn)+1 : 1)<=adsz);
517 /* Error if NOTATION specified with CONREF attribute or EMPTY element. */
518 if (notadn && (conrefsw
519 || (newetd && GET(newetd->etdmod->ttype, MNONE)))) {
520 sgmlerr((UNS)(conrefsw ? 84 : 76), &pcbstag,
521 ADNAME(al,notadn), ADVAL(al,notadn)+1);
522 SET(ADFLAGS(al,notadn), AINVALID);
525 /* AENTTST: Validate an individual ENTITY token in AENTITY or AENTITYS value.
527 VOID aenttst(adn, pt)
528 int adn; /* Position in list. */
529 UNCH *pt; /* Ptr to current ENTITY token in value. */
531 struct entity *ept; /* Save area for ecb ptr. */
533 if (++aentctr>GRPCNT) {
534 sgmlerr(136, &pcbstag, ADNAME(al,adn), pt+1);
535 SET(ADFLAGS(al,adn), AINVALID);
538 if ( (ept = entfind(pt))==0
539 && (ecbdeflt==0 || (ept = usedef(pt))==0) ) {
540 sgmlerr(ecbdeflt ? 151 : 72, &pcbstag, ADNAME(al,adn), pt+1);
541 SET(ADFLAGS(al,adn), AINVALID);
544 if (ept->estore==ESX || ept->estore==ESC || ept->estore==ESN) {
545 /* Error if DCN has no notation identifier. */
546 if (ept->estore==ESN && NEXTYPE(ept->etx.n)!=ESNSUB
547 && !NEDCNDEFINED(ept->etx.n)) {
548 sgmlerr(78, &pcbstag, NEDCN(ept->etx.n)+1,
550 SET(ADFLAGS(al,adn), AINVALID);
554 sgmlerr(86, &pcbstag, ADNAME(al,adn), pt+1);
555 SET(ADFLAGS(al,adn), AINVALID);
558 /* IDREFTST: Validate an individual IDREF token in an IDREF or IDREFS value.
560 VOID idreftst(adn, pt)
561 int adn; /* Position in list. */
562 UNCH *pt; /* Ptr to current IDREF token in value. */
565 if (++idrctr>GRPCNT) {
566 sgmlerr(70, &pcbstag, ADNAME(al,adn), pt+1);
567 SET(ADFLAGS(al,adn), AINVALID);
570 /* Note IDREF; indicate if ID exists. */
571 if ((rp = idref(pt)) != 0)
572 rp->msg = saverr(69, &pcbstag, ADNAME(al,adn), pt+1);
575 /* ANMGET: Locate an attribute name in an attribute definition list.
578 int adsz; /* Size of list. */
579 UNCH *nm; /* Value to be found (with length byte). */
581 int adn = 0; /* Position in list. */
583 while (++adn <= adsz && ustrcmp(nm+1, ADNAME(al,adn))) {
584 if (BITON(ADFLAGS(al,adn), AGROUP)) adn += (int)ADNUM(al,adn);
586 return (adn > adsz) ? 0 : adn;
588 /* ANTVGET: Find the position of a name token value in an attribute list.
589 Return the position of the attribute definition, or zero
590 if none was found. Set pp to the value, if non-NULL.
592 int antvget(adsz, nm, pp)
593 int adsz; /* Size of list. */
594 UNCH *nm; /* Value to be found (with length byte). */
595 UNCH **pp; /* Store value here */
597 int adn = 0; /* Position in list. */
599 while (++adn<=adsz) {
600 /* Test only name group members. */
601 if (BITON(ADFLAGS(al,adn), AGROUP)) {
602 int advn; /* Position of value in sub-list. */
603 if ((advn = amemget(&al[adn], (int)ADNUM(al,adn), nm))!=0) {
605 *pp = al[adn+advn].adname;
608 adn += (int)ADNUM(al,adn);
613 /* AMEMGET: Get the position of a member in an attribute name token group.
614 Returns the position, or zero if not found.
615 The length byte is ignored in the comparison so that final
616 form tokens from ATTVAL can be compared to group members.
618 int amemget(anmtgrp, adsz, nm)
619 struct ad anmtgrp[]; /* Name token group. */
620 int adsz; /* Size of group. */
621 UNCH *nm; /* Name to be found (with length byte). */
623 int adn = 0; /* Position in group. */
625 while ( ++adn<=adsz && ustrncmp(nm+1, anmtgrp[adn].adname+1, (UNS)*nm-1)) ;
626 return (adn>adsz) ? 0 : adn;
628 /* VALLEN: Returns the length of an attribute value for capacity
629 calculations. Normally, the length is NORMSEP plus the number
630 of characters. For tokenized lists, it is NORMSEP,
631 plus the number of characters in the tokens, plus
632 NORMSEP for each token.
633 ACHARS and tokenized lists don't have a length byte.
636 UNS vallen(type, num, def)
637 int type; /* ADTYPE(al,adn) */
638 int num; /* ADNUM(al,adn) */
639 UNCH *def; /* ADVAL(al,adn) */
642 return ustrlen(def) + NORMSEP;
644 return *def - 2 + NORMSEP;
645 return ustrlen(def) + num * (NORMSEP - 1) + NORMSEP;
647 /* PARSEGRP: Parse GI names, get their etds, and form an array of pointers
648 to them. The array is terminated by a NULL pointer.
649 The number of pointers (including the NULL) is returned.
650 The grp buffer must have room for GRPCNT+1 etds.
652 UNS parsegrp(grp, pcb, tbuf)
653 struct etd *grp[]; /* Buffer for building the group. */
654 struct parse *pcb; /* Current parse control block. */
657 int grpcnt = 0; /* Number of etds in the group. */
659 int essv = es; /* Entity stack level when grp started. */
661 while (parse(pcb)!=GRPE && grpcnt<GRPCNT) {
662 switch (pcb->action) {
663 case NAS_: /* GI name: get its etd for the group. */
664 grp[grpcnt] = etddef(parsenm(tbuf, NAMECASE));
665 for (i = 0; i < grpcnt; i++)
666 if (grp[i] == grp[grpcnt]) {
667 mderr(98, ntoa(grpcnt + 1), grp[grpcnt]->etdgi + 1);
674 case EE_: /* Entity ended (correctly or incorrectly). */
675 if (es<essv) {synerr(37, pcb); essv = es;}
678 case PIE_: /* PI entity reference (invalid). */
679 entpisw = 0; /* Reset PI entity indicator. */
688 grp[grpcnt++] = 0; /* NULL pointer indicates end of group. */
689 if (es!=essv) synerr(37, pcb);
690 return grpcnt; /* Return number of ptrs in group. */
692 /* PARSNGRP: Parse notation names, get their dcncbs, and form an array of
693 pointers to them. The array is terminated by a NULL pointer.
694 The number of pointers (including the NULL) is returned.
695 The grp buffer must have room for GRPCNT+1 members.
697 UNS parsngrp(grp, pcb, tbuf)
698 struct dcncb *grp[]; /* Buffer for building the group. */
699 struct parse *pcb; /* Current parse control block. */
702 int grpcnt = 0; /* Number of members in the group. */
704 int essv = es; /* Entity stack level when grp started. */
706 while (parse(pcb)!=GRPE && grpcnt<GRPCNT) {
707 switch (pcb->action) {
708 case NAS_: /* Member name: get its control block. */
709 grp[grpcnt] = dcndef(parsenm(tbuf, NAMECASE));
710 for (i = 0; i < grpcnt; i++)
711 if (grp[i] == grp[grpcnt]) {
712 mderr(98, ntoa(grpcnt + 1), grp[grpcnt]->ename + 1);
719 case EE_: /* Entity ended (correctly or incorrectly). */
720 if (es<essv) {synerr(37, pcb); essv = es;}
723 case PIE_: /* PI entity reference (invalid). */
724 entpisw = 0; /* Reset PI entity indicator. */
733 grp[grpcnt++] = 0; /* NULL pointer indicates end of group. */
734 if (es!=essv) synerr(37, pcb);
735 return grpcnt; /* Return number of ptrs in group. */
737 /* COPYGRP: Allocate storage for a group and copy the group into it.
739 PETD *copygrp(pg, grpsz)
740 PETD pg[]; /* Pointer to a group (array of etd ptrs). */
741 UNS grpsz; /* Number of ptrs in grp, including final NULL. */
743 UNS glen; /* Group length in characters. */
744 PETD *gnm; /* Ptr to permanent name group. */
746 if (pg==0) return (PETD *)0;
747 glen = grpsz * sizeof(struct etd *);
748 memcpy( (UNIV)(gnm = (struct etd **)rmalloc(glen)) , (UNIV)pg, glen );
751 /* INGRP: Locate an etd in a name group and return its index+1 (or zero
755 PETD pg[]; /* Array of pointers to etds. */
756 PETD ketd; /* Pointer to etd to be found in group. */
758 int i = 0; /* Array index. */
760 while (pg[i]) if (pg[i++]==ketd) return i;
763 /* PARSELIT: Parse a delimited string and collect it into a token.
764 Caller supplies buffer, which must be 1 longer than
765 maximum string allowed.
766 Caller also supplies character that delimits the string.
767 TODO: Return 1 if CDATA, SDATA or NONSGML occurred.
769 #ifdef USE_PROTOTYPES
770 VOID parselit(UNCH *tbuf, struct parse *pcb, UNS maxlen, UNCH del)
772 VOID parselit(tbuf, pcb, maxlen, del)
773 UNCH *tbuf; /* Work area for tokenization (parmlen+1). */
774 struct parse *pcb; /* Current parse control block. */
775 UNS maxlen; /* Maximum length of token. */
776 UNCH del; /* Literal delimiter: LIT LITA PIC EOS */
779 UNCH *pt = tbuf; /* Current pointer into tbuf. */
780 UNCH lexsv = lexlms[del];/* Saved lexlms value of delimiter. */
781 int essv = es; /* Entity stack level when literal started. */
782 UNCH datadel; /* Delimiter for CDATA/SDATA entity. */
783 int parmlen = (int)maxlen; /* Working limit (to be decremented). */
785 lexlms[del] = lex.l.litc; /* Set delimiter to act as literal close. */
787 switch (parse(pcb)) {
788 case LP2_: /* Move 2nd char back to buffer; redo prev.*/
790 case LPR_: /* Move previous char to buffer; REPEATCC; */
792 case MLA_: /* Move character to buffer. */
793 *pt++ = *FPOS; --parmlen;
796 case FUN_: /* Function char found; replace with space.*/
797 *pt++ = ' '; --parmlen;
800 case RSM_: /* Record start: ccnt=0; ++rcnt.*/
801 ++RCNT; CTRSET(RSCC); *pt++ = *FPOS; --parmlen;
804 case ERX_: /* Entity reference: cancel LITC delim. */
805 case PEX_: /* Parameter entity ref: cancel LITC delim.*/
814 /* If back at top level, re-enable the LITC delimiter. */
815 if (es==essv) lexlms[del] = lex.l.litc;
818 case MLE_: /* Char not allowed in minimum literal. */
822 case DEF_: /* Data entity: add it to buffer. */
823 if (pcb == &pcblitt) {
824 int parmlensv = parmlen;
826 parmlen = tokdata(pt, parmlen);
829 pt += parmlensv - parmlen;
832 if ((parmlen -= (int)datalen+2)<0) {entdatsw = 0; break;}
834 BITON(entdatsw, CDECONT) ? DELCDATA : DELSDATA;
836 memcpy( pt , data, datalen );
841 case NON_: /* Non-SGML char (delimited and shifted). */
842 if ((parmlen -= 2)<0) break;
843 memcpy( pt , nonchbuf, 2 );
847 case RPR_: /* Remove character from buffer. */
858 } while (parmlen>=0 && pcb->action!=TER_);
860 if (parmlen<0) {--pt; sgmlerr(134, pcb, ntoa((int)maxlen),(UNCH *)0); REPEATCC;}
861 datalen = (UNS)(pt-tbuf);/* To return PI string to text processor. */
863 lexlms[del] = lexsv; /* Restore normal delimiter handling. */
864 if (es!=essv) synerr(37, pcb);
868 /* Handle a data entity in a tokenized attribute value literal.
869 Parmlen is amount of space left. Return new parmlen. If there's not
870 enough space return -1, and copy up to parmlen + 1 characters. */
872 int tokdata(pt, parmlen)
876 int skip = (pcblitt.newstate == 0);
879 for (i = 0; parmlen >= 0 && i < datalen; i++) {
894 if (data[i] == DELNONCH) {
895 assert(i + 1 < datalen);
896 if ((parmlen -= 2) < 0)
910 pcblitt.newstate = skip ? 0 : pcblittda;
915 /* PARSEMD: Parser for markup declarations.
916 It returns a token each time it is called.
919 int parsemd(pt, namecase, lpcb, tokenlen)
920 UNCH *pt; /* Token buffer: >=tokenlen+2. */
921 int namecase; /* Case translation: ENTCASE NAMECASE AVALCASE. */
922 struct parse *lpcb; /* Parse control block for literal parse. */
923 UNS tokenlen; /* Max length of expected token: NAMELEN LITLEN */
925 struct parse *pcb; /* Current parse control block. */
927 pcb = (lpcb) ? &pcbmd : &pcbmdc; /* If no literal pcb, dcl is comment. */
929 doparse: while (parse(pcb)==EE_)
930 if (es<mdessv) {synerr(37, pcb); mdessv = es;}
931 if (pcb->action==PIE_) { /* PI entity reference not allowed. */
932 entpisw = 0; /* Reset PI entity indicator. */
936 ++parmno; /* Increment parameter counter. */
937 switch (pcb->action) {
938 case CDR: /* COM[1] (MINUS) occurred previously. */
940 return (int)pcb->action;
941 case LIT: /* Literal: CDATA with LIT delimiter. */
942 parselit(pt, lpcb, tokenlen, lex.d.lit);
943 return (int)pcb->action;
944 case LITE: /* Literal: CDATA with LITA delimiter. */
945 parselit(pt, lpcb, tokenlen, lex.d.lita);
946 return((int)(pcb->action = LIT));
947 case RNS: /* Reserved name started (after RNI). */
948 parsenm(pt, NAMECASE);
949 return (int)pcb->action;
950 case NAS: /* Name started. */
951 if (namecase!=AVALCASE) {
952 parsenm(pt, namecase);
953 return (int)pcb->action;
955 /* Treat attribute value as name character string. */
956 case NMT: /* Name token string. */
957 parsetkn(pt, NMC, (int)tokenlen); /* Get undelimited value. */
958 return (int)pcb->action;
959 case NUM: /* Number or number token string. */
960 parsetkn(pt, (UNCH)((int)tokenlen<=NAMELEN ? NU:NMC), (int)tokenlen);
961 return (int)pcb->action;
964 return (pcb->action = PEN);
968 default: /* End of declaration. */
969 return (int)pcb->action; /* EMD GRPS MGRP PEN PGRP */
972 /* PARSEMOD: If the declared content was a keyword, the token count is zero
973 and it is only necessary to save the type. Otherwise,
974 collect the outermost token count and model type bytes for a model.
975 The count includes tokens found in nested groups also.
976 After building the model, parse for its occurrence indicator.
978 struct thdr *parsemod(dctype)
979 int dctype; /* Content type (0=model). */
981 gbuf[0].ttype = (UNCH)dctype; /* Initialize content flags byte. */
982 if (dctype) {gbuf[0].tu.tnum = 0; return gbuf;} /* Return if not model. */
984 gbuf[0].tu.tnum = 0; /* Don't count 1st group or model header. */
985 gbuf[1].ttype = 0; /* Initialize 1st group type ... */
986 gbuf[1].tu.tnum = 0; /* and count. */
987 grplvl = 1; /* Content model is 1st level group. */
988 pcbgrcm.newstate = 0; /* Go parse the model group. */
989 /* Empty group is trapped during syntax parse; other errors return NULL. */
990 if (!parsegcm(&pcbgrcm, &gbuf[1], &gbuf[0])) return (struct thdr *)0;
991 parse(&pcbgrcs); /* Get the model suffix, if there is one. */
992 switch(pcbgrcs.action) {
993 case OPT: /* OPT occurrence indicator for model. */
994 SET(gbuf[1].ttype, TOPT|TXOPT);
996 case REP: /* REP occurrence indicator for model. */
997 SET(gbuf[1].ttype, TREP|TXREP);
999 case OREP: /* OREP occurrence indicator for model. */
1000 SET(gbuf[1].ttype, TOREP|TXOREP);
1002 default: /* RCR_: Repeat char and return. */
1005 if (sw.swambig) ambig(); /* Check content model for ambiguity. */
1008 /* PARSEGCM: Collect token headers (struct thdr) into a group (array).
1009 An etd is defined for each GI (if none exists) and its pointer is
1010 stored in the header. The function is called recursively.
1012 struct thdr *parsegcm(pcb, pgh, gbuf)
1013 struct parse *pcb; /* Current parse control block. */
1014 struct thdr *pgh; /* Current group header in group buffer. */
1015 struct thdr *gbuf; /* Header for outermost group (model). */
1017 #define MCON gbuf->ttype /* Model type (content attributes). */
1018 struct thdr *pg=pgh; /* Current group token. */
1019 struct thdr *pgsv=pgh; /* Saved current token for occ indicator. */
1020 int optcnt = 0; /* Count of optional tokens in group. */
1021 int essv = es; /* Entity stack level when grp started. */
1023 while (gbuf->tu.tnum<=GRPGTCNT && pgh->tu.tnum<=GRPCNT && parse(pcb)!=GRPE)
1024 switch (pcb->action) {
1026 case NAS_: /* GI name: get its etd and store it. */
1027 ++gbuf->tu.tnum; ++pgh->tu.tnum;
1028 (pgsv = ++pg)->ttype = TTETD;
1029 pg->tu.thetd = etddef(parsenm(tbuf, NAMECASE));
1033 case RNS_: /* Reserved name started (#PCDATA). */
1034 parsenm(tbuf, NAMECASE);
1035 if (ustrcmp(tbuf+1, key[KPCDATA])) {
1036 mderr(116, ntoa(gbuf->tu.tnum), tbuf+1);
1037 return (struct thdr *)0;
1039 /* If #PCDATA is the first non-group token, model is a phrase. */
1040 if (!MCON) SET(MCON, MPHRASE);
1041 case DTAG: /* Data tag template ignored; treat as #PCDATA. */
1042 if (pcb->action==DTAG) SET(pgh->ttype, TTSEQ); /* DTAG is SEQ grp. */
1043 ++gbuf->tu.tnum; ++pgh->tu.tnum;
1044 (++pg)->ttype = TTCHARS+TOREP;/* #PCDATA is OPT and REP. */
1045 pg->tu.thetd = ETDCDATA;
1046 ++optcnt; /* Ct opt tokens to see if grp is opt.*/
1050 case GRP_: /* Group started. */
1051 ++gbuf->tu.tnum; ++pgh->tu.tnum;
1052 (pgsv = ++pg)->ttype = 0; /* Type will be set by connector. */
1053 pg->tu.tnum = 0; /* Group has number instead of etd. */
1054 if (++grplvl>GRPLVL) {
1055 mderr(115, ntoa(gbuf->tu.tnum), (UNCH *)0);
1056 return (struct thdr *)0;
1058 pg = parsegcm(pcb, pg, gbuf);
1059 if (!pg) return (struct thdr *)0;
1060 if (GET(pgsv->ttype, TOPT)) ++optcnt; /* Indicate nested opt grp. */
1064 case OREP: /* OREP occurrence indicator for current token.*/
1065 SET(pgsv->ttype, TREP|TXREP);
1066 /* Now treat like OPT. */
1067 case OPT: /* OPT occurrence indicator for current token. */
1068 SET(pgsv->ttype, TXOPT);
1069 if (GET(pgsv->ttype, TOPT)) continue; /* Exit if nested opt grp. */
1070 SET(pgsv->ttype, TOPT);
1071 ++optcnt; /* Count opt tokens to see if grp is optional. */
1073 case REP: /* REP occurrence indicator for current token. */
1074 SET(pgsv->ttype, TREP|TXREP);
1077 case OR: /* OR connector found. */
1078 if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTOR);
1079 else if (GET(pgh->ttype, TTAND)!=TTOR)
1080 mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);
1082 case AND: /* AND connector found. */
1083 if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTAND);
1084 else if (GET(pgh->ttype, TTAND)!=TTAND)
1085 mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);
1087 case SEQ: /* SEQ connector found. */
1088 if BITOFF(pgh->ttype, TTAND) SET(pgh->ttype, TTSEQ);
1089 else if (GET(pgh->ttype, TTAND)!=TTSEQ)
1090 mderr(55, ntoa(gbuf->tu.tnum), (UNCH *)0);
1093 case EE_: /* Entity ended (correctly or incorrectly). */
1094 if (es<essv) {synerr(37, pcb); essv = es;}
1097 case PIE_: /* PI entity reference (not permitted). */
1098 entpisw = 0; /* Reset PI entity indicator. */
1102 default: /* Syntax errors return in disgrace. */
1104 return (struct thdr *)0;
1106 if (pgh->tu.tnum>GRPCNT) {
1107 mderr(113, ntoa(gbuf->tu.tnum), (UNCH *)0);
1108 return (struct thdr *)0;
1110 if (gbuf->tu.tnum>GRPGTCNT) {
1111 mderr(114, ntoa(gbuf->tu.tnum), (UNCH *)0);
1112 return (struct thdr *)0;
1114 if (pgh->tu.tnum==1) SET(pgh->ttype, TTSEQ); /* Unit grp is SEQ. */
1115 /* An optional token in an OR group makes the group optional. */
1116 if (GET(pgh->ttype, TTMASK)==TTOR && optcnt) SET(pgh->ttype, TOPT);
1117 /* If all tokens in any group are optional, so is the group. */
1118 if (pgh->tu.tnum<=optcnt) SET(pgh->ttype, TOPT);
1120 if (es!=essv) synerr(37, pcb);
1121 return pg; /* Return pointer to GRPS token. */
1123 /* PARSENM: Parser for SGML names, which can be translated with LEXTRAN.
1124 The input is read from the entity stack. CC is 1st char of name.
1125 Returns a pointer to the parsed name.
     Buffer layout produced by the visible stores: tbuf[0] holds the total
     length (name characters plus the EOS plus the length byte itself),
     the name characters start at tbuf[1], and an EOS follows the last one.
     The loop condition stops collection once NAMELEN characters are held.
1127 UNCH *parsenm(tbuf, nc)
1128 UNCH *tbuf; /* Buffer for name: >=NAMELEN+2. */
1129 int nc; /* Namecase translation: 1=yes; 0=no. */
1131 UNCH len; /* Length of name (incl EOS & length byte). */
1133 *(tbuf + (len = 1) ) = nc ? lextran[*FPOS] : *FPOS; /* 1st char goes to tbuf[1], via lextran if nc. */
     /* NEWCC is evaluated before each test (comma operator); presumably it
        advances to the next input character -- TODO confirm.  The loop
        continues while the class is >= NMC and fewer than NAMELEN chars
        are held. */
1134 while ((NEWCC, (int)lextoke[*FPOS]>=NMC) && (len<NAMELEN)) {
1135 TRACETKN(NMC, lextoke);
     /* Store the (possibly translated) character at tbuf[++len], then test
        the lexical class of the STORED value against EOB.
        NOTE(review): the statements controlled by this test are not shown
        beside it; confirm what they do to len and the input position. */
1136 if (lextoke[*(tbuf + ++len) = (nc ? lextran[*FPOS] : *FPOS)]==EOB) {
1141 REPEATCC; /* Put back the non-token character. */
1142 *(tbuf + ++len) = EOS; /* Terminate name with standard EOS. */
1143 *tbuf = ++len; /* Store length ahead of name. */
1146 /* PARSETKN: Parser for start-tag attribute value tokens.
1147 First character of token is already in *FPOS.
1148 Returns a pointer to the parsed token.
1149 Parsed token has EOS but no length byte.
     Characters are classified through lextoke[] and compared with scope;
     the loop is bounded by maxlen.
     NOTE(review): the statements that store characters, advance the input
     and terminate the token are not shown beside the tests below; the
     comments there describe only the visible conditions.
1151 #ifdef USE_PROTOTYPES
1152 UNCH *parsetkn(UNCH *tbuf, UNCH scope, int maxlen)
1154 UNCH *parsetkn(tbuf, scope, maxlen)
1155 UNCH *tbuf; /* Buffer for token: >=maxlen+1. */
1156 UNCH scope; /* Minimum lexical class allowed. */
1157 int maxlen; /* Maximum length of a token. */
1162 while (i < maxlen) { /* Bounded by maxlen; i is presumably the count stored so far -- TODO confirm. */
1164 if (lextoke[*FPOS] < scope) { /* Class of current char is below the minimum allowed. */
1168 TRACETKN(scope, lextoke); /* Defined outside this function; presumably a trace hook -- confirm. */
1169 if (*FPOS == EOBCHAR) /* Current char is the EOBCHAR marker. */
1177 /* PARSESEQ: Parser for blank sequences (i.e., space and TAB characters).
1178 First character of sequence is already in *FPOS.
     Accepted characters are appended to tbuf and counted in datalen.
     NOTE(review): datalen is not declared in the visible lines of this
     function; presumably it has wider scope -- confirm.
1180 VOID parseseq(tbuf, maxlen)
1181 UNCH *tbuf; /* Buffer for storing found sequence. */
1182 int maxlen; /* Maximum length of a blank sequence. */
1188 if (*FPOS == EOBCHAR) { /* Current char is the EOBCHAR marker. */
     /* True when the char is neither of class SEP nor the SPCCHAR
        character, or when maxlen characters are already held.
        NOTE(review): the controlled statement is not shown beside the
        test; presumably it ends the sequence -- confirm. */
1192 if ((lextoke[*FPOS] != SEP && *FPOS != SPCCHAR)
1193 || datalen >= maxlen)
1195 tbuf[datalen++] = *FPOS; /* Append the char and bump the stored count. */
1196 TRACETKN(SEP, lextoke);
1199 /* S2VALNM: Parser for attribute values that are tokenized like names.
1200 The input is read from a string (hence S ("string") 2 ("to") VALNM).
1201 It stops at the first bad character.
1202 Returns a pointer to the created name.
     A source character is accepted while its lextoke[] class is >= scope;
     at most NAMELEN characters are copied.
     Result layout in nm: nm[0] holds the total length (characters copied
     plus the EOS plus the length byte itself), the characters start at
     nm[1], and an EOS follows the last one.  When no character is
     accepted, nm[0] is 2 and nm[1] is EOS.
1204 #ifdef USE_PROTOTYPES
1205 UNCH *s2valnm(UNCH *nm, UNCH *s, UNCH scope, int translate)
1207 UNCH *s2valnm(nm, s, scope, translate)
1208 UNCH *nm; /* Name to be created. */
1209 UNCH *s; /* Source string to be parsed as name. */
1210 UNCH scope; /* Minimum lexical class allowed. */
1211 int translate; /* Namecase translation: 1=yes; 0=no. */
1214 UNCH len = 0; /* Length of name (incl EOS and length). */
     /* Copy while the class is acceptable and the NAMELEN cap is not hit. */
1216 for (; (int)lextoke[*s] >= scope && len < NAMELEN; s++)
1217 nm[++len] = translate ? lextran[*s] : *s; /* Map through lextran when translate is set. */
1218 nm[++len] = EOS; /* Terminate name with standard EOS. */
1219 *nm = ++len; /* Store length ahead of name. */
1222 /* PARSEVAL: Parser for attribute values.
1223 The input is read from a string and tokenized in a buffer.
1224 The input is terminated by EOS.
1225 Each token is preceded by its actual length; there is no EOS.
1226 If an error occurs while parsing, or
1227 if a token doesn't conform, set the token count to 0 to show that
1228 value was not tokenized and return the error code.
1229 After successful parse, return buffer length and 0 error code.
1230 The number of tokens found is set in external variable tokencnt.
     Visible error returns: 16 wherever the check tokencnt>1 succeeds;
     17 when the action that began the token is not the one the group
     tests for (NASA or NUMA), or when a number is immediately followed
     by a character of class NMS or NMC.  tokencnt is reset to 0 before
     every one of these returns.
     NOTE(review): the selectors that choose between the check groups in
     the body are not shown beside them; confirm which attribute type
     each group serves.  pt is likewise not declared in the visible
     lines; presumably it points into tbuf -- confirm.
1232 int parseval(s, atype, tbuf)
1233 UNCH *s; /* Source string to be parsed as token list. */
1234 UNS atype; /* Type of token list expected. */
1235 UNCH *tbuf; /* Work area for tokenization. */
1240 pcbval.newstate = 0; tokencnt = 0; /* Start in state 0 with no tokens counted. */
1243 pcbval.input = lextoke[*s]; /* Lexical class of the current source char. */
1244 pcbval.state = pcbval.newstate; /* Commit the state chosen by the previous step. */
1245 pcbval.newstate = (*(pcbval.ptab + pcbval.state)) [pcbval.input]; /* Table at ptab+state gives the next state. */
1246 pcbval.action = (*(pcbval.ptab + pcbval.state+1)) [pcbval.input]; /* Table at ptab+state+1 gives the action. */
1247 TRACEVAL(&pcbval, atype, s, tokencnt);
1248 if (pcbval.action != NOPA) /* Any action other than NOPA; controlled statement not shown beside it -- confirm. */
1254 switch (pcbval.action) {
1255 case INVA: /* Invalid character; terminate parse. */
1256 if (*s == '\0') goto alldone; /* Normal termination. */
1257 tokencnt = 0; /* Value was not tokenized. */
1259 case LENA: /* Length limit of token exceeded; end parse. */
1260 tokencnt = 0; /* Value was not tokenized. */
1262 default: /* Token begun: NUMA, NASA, or NMTA. */
1266 ++tokencnt; /* One token per iteration. */
     /* Group: at most one token; it must have begun as NASA; copied with
        scope NMC and the ENTCASE translation flag. */
1269 if (tokencnt>1) {tokencnt = 0; return(16);}
1271 if (pcbval.action!=NASA) {tokencnt = 0; return(17);}
1272 s2valnm(pt, s, NMC, ENTCASE);
     /* Group: at most one token; it must have begun as NASA; copied with
        scope NMC and the NAMECASE translation flag. */
1279 if (tokencnt>1) {tokencnt = 0; return(16);}
1282 if (pcbval.action!=NASA) {tokencnt = 0; return(17);}
1283 s2valnm(pt, s, NMC, NAMECASE);
     /* Group: at most one token; no check on the starting action. */
1288 if (tokencnt>1) {tokencnt = 0; return(16);}
1290 /* No test needed because NMTA, NUMA and NASA are all valid. */
1291 s2valnm(pt, s, NMC, NAMECASE);
     /* Group: at most one token; it must have begun as NUMA; copied with
        scope NU. */
1295 if (tokencnt>1) {tokencnt = 0; return(16);}
1297 if (pcbval.action!=NUMA) {tokencnt = 0; return(17);}
1298 s2valnm(pt, s, NU, NAMECASE);
     /* *pt is the length byte s2valnm stored (characters copied + 2), so
        s[*pt - 2] is the first source character that was not copied. */
1299 t = lextoke[s[*pt - 2]];
1300 if (t == NMS || t == NMC) {tokencnt = 0; return(17);} /* Reject when that next char has class NMS or NMC. */
     /* Group: at most one token; it must have begun as NUMA; copied with
        scope NMC. */
1304 if (tokencnt>1) {tokencnt = 0; return(16);}
1306 if (pcbval.action!=NUMA) {tokencnt = 0; return(17);}
1307 s2valnm(pt, s, NMC, NAMECASE);
1318 if (atype < ATKNLIST) /* Only for types ordered below ATKNLIST. */
1319 *tbuf += 2; /* include length and EOS */
1325 c-continued-statement-offset: 5