module Text.HTML.TagSoup.Specification(parse) where
import Text.HTML.TagSoup.Implementation
import Data.Char
white x = x `elem` "\t\n\f "
type Parser = S -> [Out]
parse :: String -> [Out]
parse = dat . state
dat :: Parser
dat S{..} = pos $ case hd of
'&' -> charReference tl
'<' -> tagOpen tl
_ | eof -> []
_ -> hd & dat tl
charReference s = charRef dat False Nothing s
tagOpen S{..} = case hd of
'!' -> markupDeclOpen tl
'/' -> closeTagOpen tl
_ | isAlpha hd -> Tag & hd & tagName False tl
'>' -> errSeen "<>" & '<' & '>' & dat tl
'?' -> neilXmlTagOpen tl
_ -> errSeen "<" & '<' & dat s
neilXmlTagOpen S{..} = pos $ case hd of
_ | isAlpha hd -> Tag & '?' & hd & tagName True tl
_ -> errSeen "<?" & '<' & '?' & dat s
neilXmlTagClose S{..} = pos $ case hd of
'>' -> TagEnd & dat tl
_ -> errSeen "?" & beforeAttName True s
neilTagEnd xml S{..}
| xml = pos $ errWant "?>" & TagEnd & dat s
| otherwise = pos $ TagEnd & dat s
closeTagOpen S{..} = case hd of
_ | isAlpha hd || hd `elem` "?!" -> TagShut & hd & tagName False tl
'>' -> errSeen "</>" & '<' & '/' & '>' & dat tl
_ | eof -> '<' & '/' & dat s
_ -> errWant "tag name" & bogusComment s
tagName xml S{..} = pos $ case hd of
_ | white hd -> beforeAttName xml tl
'/' -> selfClosingStartTag xml tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | isAlpha hd -> hd & tagName xml tl
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> hd & tagName xml tl
beforeAttName xml S{..} = pos $ case hd of
_ | white hd -> beforeAttName xml tl
'/' -> selfClosingStartTag xml tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | hd `elem` "\'\"" -> beforeAttValue xml s
_ | hd `elem` "\"'<=" -> errSeen [hd] & AttName & hd & attName xml tl
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> AttName & hd & attName xml tl
attName xml S{..} = pos $ case hd of
_ | white hd -> afterAttName xml tl
'/' -> selfClosingStartTag xml tl
'=' -> beforeAttValue xml tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | hd `elem` "\"'<" -> errSeen [hd] & def
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> def
where def = hd & attName xml tl
afterAttName xml S{..} = pos $ case hd of
_ | white hd -> afterAttName xml tl
'/' -> selfClosingStartTag xml tl
'=' -> beforeAttValue xml tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | hd `elem` "\"'" -> AttVal & beforeAttValue xml s
_ | hd `elem` "\"'<" -> errSeen [hd] & def
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> def
where def = AttName & hd & attName xml tl
beforeAttValue xml S{..} = pos $ case hd of
_ | white hd -> beforeAttValue xml tl
'\"' -> AttVal & attValueDQuoted xml tl
'&' -> AttVal & attValueUnquoted xml s
'\'' -> AttVal & attValueSQuoted xml tl
'>' -> errSeen "=" & neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | hd `elem` "<=" -> errSeen [hd] & def
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> def
where def = AttVal & hd & attValueUnquoted xml tl
attValueDQuoted xml S{..} = pos $ case hd of
'\"' -> afterAttValueQuoted xml tl
'&' -> charRefAttValue (attValueDQuoted xml) (Just '\"') tl
_ | eof -> errWant "\"" & dat s
_ -> hd & attValueDQuoted xml tl
attValueSQuoted xml S{..} = pos $ case hd of
'\'' -> afterAttValueQuoted xml tl
'&' -> charRefAttValue (attValueSQuoted xml) (Just '\'') tl
_ | eof -> errWant "\'" & dat s
_ -> hd & attValueSQuoted xml tl
attValueUnquoted xml S{..} = pos $ case hd of
_ | white hd -> beforeAttName xml tl
'&' -> charRefAttValue (attValueUnquoted xml) Nothing tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | hd `elem` "\"'<=" -> errSeen [hd] & def
_ | eof -> errWant (if xml then "?>" else ">") & dat s
_ -> def
where def = hd & attValueUnquoted xml tl
charRefAttValue :: Parser -> Maybe Char -> Parser
charRefAttValue resume c s = charRef resume True c s
afterAttValueQuoted xml S{..} = pos $ case hd of
_ | white hd -> beforeAttName xml tl
'/' -> selfClosingStartTag xml tl
'>' -> neilTagEnd xml tl
'?' | xml -> neilXmlTagClose tl
_ | eof -> dat s
_ -> errSeen [hd] & beforeAttName xml s
selfClosingStartTag xml S{..} = pos $ case hd of
_ | xml -> errSeen "/" & beforeAttName xml s
'>' -> TagEndClose & dat tl
_ | eof -> errWant ">" & dat s
_ -> errSeen "/" & beforeAttName xml s
bogusComment S{..} = Comment & bogusComment1 s
bogusComment1 S{..} = pos $ case hd of
'>' -> CommentEnd & dat tl
_ | eof -> CommentEnd & dat s
_ -> hd & bogusComment1 tl
markupDeclOpen S{..} = pos $ case hd of
_ | Just s <- next "--" -> Comment & commentStart s
_ | isAlpha hd -> Tag & '!' & hd & tagName False tl
_ | Just s <- next "[CDATA[" -> cdataSection s
_ -> errWant "tag name" & bogusComment s
commentStart S{..} = pos $ case hd of
'-' -> commentStartDash tl
'>' -> errSeen "<!-->" & CommentEnd & dat tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> hd & comment tl
commentStartDash S{..} = pos $ case hd of
'-' -> commentEnd tl
'>' -> errSeen "<!--->" & CommentEnd & dat tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> '-' & hd & comment tl
comment S{..} = pos $ case hd of
'-' -> commentEndDash tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> hd & comment tl
commentEndDash S{..} = pos $ case hd of
'-' -> commentEnd tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> '-' & hd & comment tl
commentEnd S{..} = pos $ case hd of
'>' -> CommentEnd & dat tl
'-' -> errWant "-->" & '-' & commentEnd tl
_ | white hd -> errSeen "--" & '-' & '-' & hd & commentEndSpace tl
'!' -> errSeen "!" & commentEndBang tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> errSeen "--" & '-' & '-' & hd & comment tl
commentEndBang S{..} = pos $ case hd of
'>' -> CommentEnd & dat tl
'-' -> '-' & '-' & '!' & commentEndDash tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> '-' & '-' & '!' & hd & comment tl
commentEndSpace S{..} = pos $ case hd of
'>' -> CommentEnd & dat tl
'-' -> commentEndDash tl
_ | white hd -> hd & commentEndSpace tl
_ | eof -> errWant "-->" & CommentEnd & dat s
_ -> hd & comment tl
cdataSection S{..} = pos $ case hd of
_ | Just s <- next "]]>" -> dat s
_ | eof -> dat s
_ | otherwise -> hd & cdataSection tl
charRef :: Parser -> Bool -> Maybe Char -> S -> [Out]
charRef resume att end S{..} = pos $ case hd of
_ | eof || hd `elem` "\t\n\f <&" || maybe False (== hd) end -> '&' & resume s
'#' -> charRefNum resume s tl
_ -> charRefAlpha resume att s
charRefNum resume o S{..} = pos $ case hd of
_ | hd `elem` "xX" -> charRefNum2 resume o True tl
_ -> charRefNum2 resume o False s
charRefNum2 resume o hex S{..} = pos $ case hd of
_ | hexChar hex hd -> (if hex then EntityHex else EntityNum) & hd & charRefNum3 resume hex tl
_ -> errSeen "&" & '&' & resume o
charRefNum3 resume hex S{..} = pos $ case hd of
_ | hexChar hex hd -> hd & charRefNum3 resume hex tl
';' -> EntityEnd True & resume tl
_ -> EntityEnd False & errWant ";" & resume s
charRefAlpha resume att S{..} = pos $ case hd of
_ | isAlpha hd -> EntityName & hd & charRefAlpha2 resume att tl
_ -> errSeen "&" & '&' & resume s
charRefAlpha2 resume att S{..} = pos $ case hd of
_ | alphaChar hd -> hd & charRefAlpha2 resume att tl
';' -> EntityEnd True & resume tl
_ | att -> EntityEnd False & resume s
_ -> EntityEnd False & errWant ";" & resume s
alphaChar x = isAlphaNum x || x `elem` ":-_"
hexChar False x = isDigit x
hexChar True x = isDigit x || (x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F')