1:
37:
38:
39: package ;
40:
41: import ;
42: import ;
43: import ;
44: import ;
45: import ;
46: import ;
47: import ;
48: import ;
49:
50: import ;
51: import ;
52:
53: import ;
54: import ;
55: import ;
56: import ;
57:
58: import ;
59: import ;
60: import ;
61: import ;
62: import ;
63: import ;
64: import ;
65: import ;
66:
67:
90: public class Parser
91: extends ReaderTokenizer
92: implements DTDConstants
93: {
94:
97: public Token hTag = new Token();
98:
99:
102: protected DTD dtd;
103:
104:
110: protected boolean strict;
111:
112:
115: protected int preformatted = 0;
116:
117:
121: private Set documentTags =
122: new TreeSet(new Comparator()
123: {
124: public int compare(Object a, Object b)
125: {
126: return ((String) a).compareToIgnoreCase((String) b);
127: }
128: }
129: );
130:
131:
134: private StringBuffer buffer = new StringBuffer();
135:
136:
139: private StringBuffer title = new StringBuffer();
140:
141:
144: private Token t;
145:
146:
150: private boolean titleHandled;
151:
152:
156: private boolean titleOpen;
157:
158:
162: htmlAttributeSet attributes =
163: htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
164:
165:
169: private htmlValidator validator;
170:
171:
175: private parameterDefaulter defaulter;
176:
177:
180: private textPreProcessor textProcessor = new textPreProcessor();
181:
182:
190: public Parser(DTD a_dtd)
191: {
192: if (a_dtd == null)
193: dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
194: else
195: dtd = a_dtd;
196:
197: defaulter = new parameterDefaulter(dtd);
198:
199: validator =
200: new htmlValidator(dtd)
201: {
202:
207: protected void s_error(String msg)
208: {
209: error(msg);
210: }
211:
212:
220: protected void handleSupposedEndTag(Element tElement)
221: {
222:
223:
224:
225: TagElement tag = makeTag(tElement, true);
226: _handleEndTag_remaining(tag);
227: }
228:
229:
238: protected void handleSupposedStartTag(Element tElement)
239: {
240: TagElement tag = makeTag(tElement, true);
241: htmlAttributeSet were = attributes;
242: attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
243: _handleStartTag(tag);
244: attributes = were;
245: }
246: };
247: }
248:
249:
253: public htmlAttributeSet getAttributes()
254: {
255: return attributes;
256: }
257:
258:
262: public void error(String msg)
263: {
264: error(msg, getTokenAhead());
265: }
266:
267: public void error(String msg, Token atToken)
268: {
269: if (atToken != null)
270: handleError(atToken.where.beginLine,
271: msg + ": line " + atToken.where.beginLine +
272: ", absolute pos " + atToken.where.startPosition
273: );
274: else
275: handleError(0, msg);
276: }
277:
278:
282: public void error(String msg, String invalid)
283: {
284: error(msg + ": '" + invalid + "'");
285: }
286:
287:
291: public void error(String parm1, String parm2, String parm3)
292: {
293: error(parm1 + " " + parm2 + " " + parm3);
294: }
295:
296:
300: public void error(String parm1, String parm2, String parm3, String parm4)
301: {
302: error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
303: }
304:
305: public void flushAttributes()
306: {
307: }
308:
309:
315: public synchronized void parse(Reader reader)
316: throws IOException
317: {
318: reset(reader);
319: restart();
320: try
321: {
322: parseDocument();
323: validator.closeAll();
324: }
325: catch (ParseException ex)
326: {
327: if (ex != null)
328: {
329: error("Unable to continue parsing the document", ex.getMessage());
330:
331: Throwable cause = ex.getCause();
332: if (cause instanceof IOException)
333: throw (IOException) cause;
334: }
335: }
336: }
337:
338:
343: public String parseDTDMarkup()
344: throws IOException
345: {
346: return null;
347: }
348:
349:
366: public boolean parseMarkupDeclarations(StringBuffer strBuff)
367: throws IOException
368: {
369: return false;
370: }
371:
372:
375: protected int getCurrentLine()
376: {
377: return hTag.where.beginLine;
378: }
379:
380:
388: protected void CDATA(boolean clearBuffer)
389: throws ParseException
390: {
391: Token start = hTag = getTokenAhead();
392:
393: if (clearBuffer)
394: buffer.setLength(0);
395:
396:
397: if (start.kind == EOF)
398: return;
399:
400: read:
401: while (true)
402: {
403: t = getTokenAhead();
404: if (t.kind == EOF)
405: {
406: error("unexpected eof", t);
407: break read;
408: }
409: else if (t.kind == BEGIN)
410: break read;
411: else if (t.kind == Constants.ENTITY)
412: {
413: resolveAndAppendEntity(t);
414: getNextToken();
415: }
416: else
417: {
418: append(t);
419: getNextToken();
420: }
421: }
422: hTag = new Token(start, getTokenAhead(0));
423: if (buffer.length() != 0)
424: _handleText();
425: }
426:
427:
432: protected void Comment()
433: throws ParseException
434: {
435: buffer.setLength(0);
436:
437: Token start = hTag = mustBe(BEGIN);
438: optional(WS);
439: mustBe(EXCLAMATION);
440: optional(WS);
441: mustBe(DOUBLE_DASH);
442:
443: Token t;
444: Token last;
445:
446: comment:
447: while (true)
448: {
449: t = getTokenAhead();
450: if (t.kind == EOF)
451: {
452: handleEOFInComment();
453: last = t;
454: break comment;
455: }
456: else if (COMMENT_END.matches(this))
457: {
458: mustBe(DOUBLE_DASH);
459: optional(WS);
460: last = mustBe(END);
461: break comment;
462: }
463: else if (COMMENT_TRIPLEDASH_END.matches(this))
464: {
465: mustBe(DOUBLE_DASH);
466: t = mustBe(NUMTOKEN);
467: if (t.getImage().equals("-"))
468: {
469: append(t);
470: last = mustBe(END);
471: break comment;
472: }
473: else
474: {
475: buffer.append("--");
476: append(t);
477: t = getTokenAhead();
478: }
479: }
480: else
481:
482: if ((t.getImage().endsWith("--")) &&
483: (
484: getTokenAhead(1).kind == END ||
485: (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
486: )
487: )
488: {
489: buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
490:
491:
492: last = mustBe(t.kind);
493: break comment;
494: }
495: else
496: append(t);
497: mustBe(t.kind);
498: }
499: hTag = new Token(start, last);
500: handleComment();
501: }
502:
503:
507: protected void Script()
508: throws ParseException
509: {
510: Token name;
511:
512: Token start = hTag = mustBe(BEGIN);
513: optional(WS);
514:
515: name = mustBe(SCRIPT);
516:
517: optional(WS);
518:
519: restOfTag(false, name, start);
520:
521: buffer.setLength(0);
522:
523: script:
524: while (!SCRIPT_CLOSE.matches(this))
525: {
526: append(getNextToken());
527: }
528:
529: consume(SCRIPT_CLOSE);
530:
531: _handleText();
532:
533: endTag(false);
534: _handleEndTag(makeTagElement(name.getImage(), false));
535: }
536:
537:
540: protected void Sgml()
541: throws ParseException
542: {
543: if (COMMENT_OPEN.matches(this))
544: Comment();
545: else
546: {
547: Token start = hTag = mustBe(BEGIN);
548: optional(WS);
549: mustBe(EXCLAMATION);
550:
551: buffer.setLength(0);
552: read:
553: while (true)
554: {
555: t = getNextToken();
556: if (t.kind == Constants.ENTITY)
557: {
558: resolveAndAppendEntity(t);
559: }
560: else if (t.kind == EOF)
561: {
562: error("unexpected eof", t);
563: break read;
564: }
565: else if (t.kind == END)
566: break read;
567: else
568: append(t);
569: }
570:
571: try
572: {
573: parseMarkupDeclarations(buffer);
574: }
575: catch (IOException ex)
576: {
577: error("Unable to parse SGML insertion: '" + buffer + "'",
578: new Token(start, t)
579: );
580: }
581: }
582: }
583:
584:
588: protected void Style()
589: throws ParseException
590: {
591: Token name;
592:
593: Token start = hTag = mustBe(BEGIN);
594: optional(WS);
595:
596: name = mustBe(STYLE);
597:
598: optional(WS);
599:
600: restOfTag(false, name, start);
601:
602: buffer.setLength(0);
603:
604: style:
605: while (!STYLE_CLOSE.matches(this))
606: {
607: append(getNextToken());
608: }
609:
610: consume(STYLE_CLOSE);
611:
612: _handleText();
613:
614: endTag(false);
615: _handleEndTag(makeTagElement(name.getImage(), false));
616: }
617:
618:
621: protected void Tag()
622: throws ParseException
623: {
624: mark(true);
625:
626: boolean closing = false;
627: Token name;
628: Token start = hTag = mustBe(BEGIN);
629:
630: optional(WS);
631: name = getNextToken();
632: optional(WS);
633:
634: if (name.kind == SLASH)
635: {
636: closing = true;
637: name = getNextToken();
638: }
639:
640: restOfTag(closing, name, start);
641: }
642:
643:
652: protected void _handleText()
653: {
654: char[] text;
655:
656: if (preformatted > 0)
657: text = textProcessor.preprocessPreformatted(buffer);
658: else
659: text = textProcessor.preprocess(buffer);
660:
661: if (text != null && text.length > 0)
662: {
663: TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
664: attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
665: _handleEmptyTag(pcdata);
666:
667: handleText(text);
668: if (titleOpen)
669: title.append(text);
670: }
671: }
672:
673:
677: protected final void append(Token t)
678: {
679: if (t.kind != EOF)
680: t.appendTo(buffer);
681: }
682:
683:
687: protected final void consume(pattern p)
688: {
689: node n;
690: for (int i = 0; i < p.nodes.length; i++)
691: {
692: n = p.nodes [ i ];
693: if (n.optional)
694: optional(n.kind);
695: else
696: mustBe(n.kind);
697: }
698: }
699:
700:
709: protected void endTag(boolean omitted)
710: {
711: }
712:
713:
717: protected void handleComment(char[] comment)
718: {
719: }
720:
721:
727: protected void handleEOFInComment()
728: {
729: error("Unclosed comment");
730: }
731:
732:
739: protected void handleEmptyTag(TagElement tag)
740: throws javax.swing.text.ChangedCharSetException
741: {
742: }
743:
744:
750: protected void handleEndTag(TagElement tag)
751: {
752: }
753:
754:
755: protected void handleError(int line, String message)
756: {
757: }
758:
759:
765: protected void handleStartTag(TagElement tag)
766: {
767: }
768:
769:
783: protected void handleText(char[] text)
784: {
785: }
786:
787:
794: protected void handleTitle(char[] title)
795: {
796: }
797:
798:
803: protected TagElement makeTag(Element element)
804: {
805: return makeTag(element, false);
806: }
807:
808:
816: protected TagElement makeTag(Element element, boolean isSupposed)
817: {
818: return new TagElement(element, isSupposed);
819: }
820:
821:
826: protected void markFirstTime(Element element)
827: {
828: }
829:
830:
834: protected Token mustBe(int kind)
835: {
836: if (getTokenAhead().kind == kind)
837: return getNextToken();
838: else
839: {
840: String ei = "";
841: if (kind < 1000)
842: ei = " ('" + (char) kind + "') ";
843: throw new AssertionError("The token of kind " + kind + ei +
844: " MUST be here,"
845: );
846: }
847: }
848:
849:
858: protected void noValueAttribute(String element, String attribute)
859: {
860: Object value = HTML.NULL_ATTRIBUTE_VALUE;
861:
862: Element e = (Element) dtd.elementHash.get(element.toLowerCase());
863: if (e != null)
864: {
865: AttributeList attr = e.getAttribute(attribute);
866: if (attr != null)
867: {
868: Vector values = attr.values;
869: if (values != null && values.size() == 1)
870: value = values.get(0);
871: }
872: }
873: attributes.addAttribute(attribute, value);
874: }
875:
876:
880: protected Token optional(int kind)
881: {
882: if (getTokenAhead().kind == kind)
883: return getNextToken();
884: else
885: return null;
886: }
887:
888:
889: protected void parseDocument()
890: throws ParseException
891: {
892: while (getTokenAhead().kind != EOF)
893: {
894: advanced = false;
895: if (TAG.matches(this))
896: Tag();
897: else if (COMMENT_OPEN.matches(this))
898: Comment();
899: else if (STYLE_OPEN.matches(this))
900: Style();
901: else if (SCRIPT_OPEN.matches(this))
902: Script();
903: else if (SGML.matches(this))
904: Sgml();
905: else
906: CDATA(true);
907:
908:
909: if (!advanced)
910: {
911: Token wrong = getNextToken();
912: error("unexpected '" + wrong.getImage() + "'", wrong);
913: buffer.setLength(0);
914: buffer.append(wrong.getImage());
915: _handleText();
916: }
917: }
918: }
919:
920:
925: protected void readAttributes(String element)
926: {
927: Token name;
928: Token value;
929: Token next;
930: String attrValue;
931:
932: attributes = new htmlAttributeSet();
933:
934: optional(WS);
935:
936: attributeReading:
937: while (getTokenAhead().kind == NUMTOKEN)
938: {
939: name = getNextToken();
940: optional(WS);
941:
942: next = getTokenAhead();
943: if (next.kind == EQ)
944: {
945: mustBe(EQ);
946: optional(WS);
947:
948: next = getNextToken();
949:
950: switch (next.kind)
951: {
952: case QUOT :
953:
954:
955: buffer.setLength(0);
956: readTillTokenE(QUOT);
957: attrValue = buffer.toString();
958: break;
959:
960: case AP :
961:
962:
963: buffer.setLength(0);
964: readTillTokenE(AP);
965: attrValue = buffer.toString();
966: break;
967:
968:
969: case NUMTOKEN :
970: value = next;
971: optional(WS);
972:
973:
974: next = getTokenAhead();
975: if (bQUOTING.get(next.kind))
976: {
977: hTag = next;
978: error("The value without opening quote is closed with '" +
979: next.getImage() + "'"
980: );
981: }
982: attrValue = value.getImage();
983: break;
984:
985: default :
986: break attributeReading;
987: }
988: attributes.addAttribute(name.getImage(), attrValue);
989: optional(WS);
990: }
991: else
992: {
993: noValueAttribute(element, name.getImage());
994: }
995: }
996: }
997:
998:
1003: protected String resolveNamedEntity(final String a_tag)
1004: {
1005:
1006: if (!a_tag.startsWith("&"))
1007: throw new AssertionError("Named entity " + a_tag +
1008: " must start witn '&'."
1009: );
1010:
1011: String tag = a_tag.substring(1);
1012:
1013: try
1014: {
1015: Entity entity = dtd.getEntity(tag);
1016: if (entity != null)
1017: return entity.getString();
1018:
1019: entity = dtd.getEntity(tag.toLowerCase());
1020:
1021: if (entity != null)
1022: {
1023: error("The name of this entity should be in lowercase", a_tag);
1024: return entity.getString();
1025: }
1026: }
1027: catch (IndexOutOfBoundsException ibx)
1028: {
1029:
1030: }
1031:
1032: error("Unknown named entity", a_tag);
1033: return a_tag;
1034: }
1035:
1036:
1041: protected char resolveNumericEntity(final String a_tag)
1042: {
1043:
1044: if (!a_tag.startsWith("&#"))
1045: throw new AssertionError("Numeric entity " + a_tag +
1046: " must start witn '&#'."
1047: );
1048:
1049: String tag = a_tag.substring(2);
1050:
1051: try
1052: {
1053:
1054: char cx = tag.charAt(0);
1055: if (cx == 'x' || cx == 'X')
1056:
1057: return (char) Integer.parseInt(tag.substring(1), 16);
1058:
1059: return (char) Integer.parseInt(tag);
1060: }
1061:
1062:
1063: catch (NumberFormatException nex)
1064: {
1065: }
1066: catch (IndexOutOfBoundsException ix)
1067: {
1068: }
1069:
1070: error("Invalid numeric entity", a_tag);
1071: return '?';
1072: }
1073:
1074:
1078: protected void restart()
1079: {
1080: documentTags.clear();
1081: titleHandled = false;
1082: titleOpen = false;
1083: buffer.setLength(0);
1084: title.setLength(0);
1085: validator.restart();
1086: }
1087:
1088:
1095: protected void startTag(TagElement tag)
1096: throws ChangedCharSetException
1097: {
1098: }
1099:
1100:
1106: private void _handleCompleteElement(TagElement tag)
1107: {
1108: _handleStartTag(tag);
1109:
1110:
1111: HTML.Tag h = tag.getHTMLTag();
1112: if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1113: {
1114: boolean tmp = titleOpen;
1115: titleOpen = false;
1116: _handleText();
1117: titleOpen = tmp;
1118: }
1119: else
1120: _handleText();
1121:
1122: _handleEndTag(tag);
1123: }
1124:
1125:
1131: private void _handleEmptyTag(TagElement tag)
1132: {
1133: try
1134: {
1135: validator.validateTag(tag, attributes);
1136: handleEmptyTag(tag);
1137: }
1138: catch (ChangedCharSetException ex)
1139: {
1140: error("Changed charset exception:", ex.getMessage());
1141: }
1142: }
1143:
1144:
1150: private void _handleEndTag(TagElement tag)
1151: {
1152: validator.closeTag(tag);
1153: _handleEndTag_remaining(tag);
1154: }
1155:
1156:
1161: void _handleEndTag_remaining(TagElement tag)
1162: {
1163: HTML.Tag h = tag.getHTMLTag();
1164:
1165: handleEndTag(tag);
1166: endTag(tag.fictional());
1167:
1168: if (h.isPreformatted())
1169: preformatted--;
1170: if (preformatted < 0)
1171: preformatted = 0;
1172:
1173: if (h == HTML.Tag.TITLE)
1174: {
1175: titleOpen = false;
1176: titleHandled = true;
1177:
1178: char[] a = new char[ title.length() ];
1179: title.getChars(0, a.length, a, 0);
1180: handleTitle(a);
1181: }
1182: }
1183:
1184:
1191: void _handleStartTag(TagElement tag)
1192: {
1193: validator.openTag(tag, attributes);
1194: startingTag(tag);
1195: handleStartTag(tag);
1196:
1197: HTML.Tag h = tag.getHTMLTag();
1198:
1199: if (h.isPreformatted())
1200: preformatted++;
1201:
1202: if (h == HTML.Tag.TITLE)
1203: {
1204: if (titleHandled)
1205: error("Repetetive <TITLE> tag");
1206: titleOpen = true;
1207: titleHandled = false;
1208: }
1209: }
1210:
1211:
1215: private void forciblyCloseTheTag()
1216: throws ParseException
1217: {
1218: int closeAt = 0;
1219: buffer.setLength(0);
1220:
1221: ahead:
1222: for (int i = 1; i < 100; i++)
1223: {
1224: t = getTokenAhead(i - 1);
1225: if (t.kind == EOF || t.kind == BEGIN)
1226: break ahead;
1227: if (t.kind == END)
1228: {
1229:
1230: closeAt = i;
1231: break ahead;
1232: }
1233: }
1234: if (closeAt > 0)
1235: {
1236: buffer.append("Ignoring '");
1237: for (int i = 1; i <= closeAt; i++)
1238: {
1239: t = getNextToken();
1240: append(t);
1241: }
1242: buffer.append('\'');
1243: error(buffer.toString());
1244: }
1245: }
1246:
1247:
1251: private void handleComment()
1252: {
1253: char[] a = new char[ buffer.length() ];
1254: buffer.getChars(0, a.length, a, 0);
1255: handleComment(a);
1256: }
1257:
1258: private TagElement makeTagElement(String name, boolean isSupposed)
1259: {
1260: Element e = (Element) dtd.elementHash.get(name.toLowerCase());
1261: if (e == null)
1262: {
1263: error("Unknown tag <" + name + ">");
1264: e = dtd.getElement(name);
1265: e.name = name.toUpperCase();
1266: e.index = -1;
1267: }
1268:
1269: if (!documentTags.contains(e.name))
1270: {
1271: markFirstTime(e);
1272: documentTags.add(e.name);
1273: }
1274:
1275: return makeTag(e, isSupposed);
1276: }
1277:
1278:
1284: private void readTillTokenE(int till)
1285: throws ParseException
1286: {
1287: buffer.setLength(0);
1288: read:
1289: while (true)
1290: {
1291: t = getNextToken();
1292: if (t.kind == Constants.ENTITY)
1293: {
1294: resolveAndAppendEntity(t);
1295: }
1296: else if (t.kind == EOF)
1297: {
1298: error("unexpected eof", t);
1299: break read;
1300: }
1301: else if (t.kind == till)
1302: break read;
1303: else if (t.kind == WS)
1304: {
1305:
1306: String s = t.getImage();
1307: char c;
1308: for (int i = 0; i < s.length(); i++)
1309: {
1310: c = s.charAt(i);
1311: if (c == '\r')
1312: buffer.append(' ');
1313: else if (c == '\n')
1314: ;
1315: else if (c == '\t')
1316: buffer.append(' ');
1317: else
1318: buffer.append(c);
1319: }
1320: }
1321: else
1322: append(t);
1323: }
1324: }
1325:
1326:
1330: private void resolveAndAppendEntity(Token entity)
1331: {
1332: switch (entity.category)
1333: {
1334: case ENTITY_NAMED :
1335: buffer.append(resolveNamedEntity(entity.getImage()));
1336: break;
1337:
1338: case ENTITY_NUMERIC :
1339: buffer.append(resolveNumericEntity(entity.getImage()));
1340: break;
1341:
1342: default :
1343: throw new AssertionError("Invalid entity category " +
1344: entity.category
1345: );
1346: }
1347: }
1348:
1349:
1357: private void restOfTag(boolean closing, Token name, Token start)
1358: throws ParseException
1359: {
1360: boolean end = false;
1361: Token next;
1362:
1363: optional(WS);
1364:
1365: readAttributes(name.getImage());
1366:
1367: optional(WS);
1368:
1369: next = getTokenAhead();
1370: if (next.kind == END)
1371: {
1372: mustBe(END);
1373: end = true;
1374: }
1375:
1376: hTag = new Token(start, next);
1377:
1378: attributes.setResolveParent(defaulter.getDefaultParameters(name.getImage()));
1379:
1380: if (!end)
1381: {
1382:
1383:
1384: if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1385: backupMode
1386: )
1387: {
1388: error("Errors in tag body and unknown tag name. " +
1389: "Treating the tag as a text."
1390: );
1391: reset();
1392:
1393: hTag = mustBe(BEGIN);
1394: buffer.setLength(0);
1395: buffer.append(hTag.getImage());
1396: CDATA(false);
1397: return;
1398: }
1399: else
1400: {
1401: error("Forcibly closing invalid parameter list");
1402: forciblyCloseTheTag();
1403: }
1404: }
1405:
1406: if (closing)
1407: {
1408: endTag(false);
1409: _handleEndTag(makeTagElement(name.getImage(), false));
1410: }
1411: else
1412: {
1413: TagElement te = makeTagElement(name.getImage(), false);
1414: if (te.getElement().type == DTDConstants.EMPTY)
1415: _handleEmptyTag(te);
1416: else
1417: _handleStartTag(te);
1418: }
1419: }
1420:
1421:
1427: private void startingTag(TagElement tag)
1428: {
1429: try
1430: {
1431: startTag(tag);
1432: }
1433: catch (ChangedCharSetException cax)
1434: {
1435: error("Invalid change of charset");
1436: }
1437: }
1438:
1439: private void ws_error()
1440: {
1441: error("Whitespace here is not permitted");
1442: }
1443: }