I've been exploring the Stack Overflow data dumps, so far taking advantage of the friendly XML and “parsing” it with regular expressions. My attempts to use various Haskell XML libraries to find the first post (in document order) by a particular user all ran into nasty thrashing.
TagSoup
import Control.Monad
import Text.HTML.TagSoup
-- | Value matched against each row's @OwnerUserId@ attribute.
userid :: String
userid = "83805"
-- | Print the @Id@ attribute of the first @row@ tag in @posts.xml@ whose
-- @OwnerUserId@ matches 'userid', or @<not present>@ when no tag matches.
main :: IO ()
main = do
  posts <- liftM parseTags (readFile "posts.xml")
  let wanted  = "<row OwnerUserId=" ++ userid ++ ">"
      -- Built lazily, so only the tags up to the first match are examined.
      postIds = [fromAttrib "Id" tag | tag <- posts, tag ~== wanted]
  -- Pattern match instead of 'head': an absent user must not crash the program.
  case postIds of
    firstId : _ -> print firstId
    []          -> putStrLn "<not present>"
hxt
import Text.XML.HXT.Arrow
import Text.XML.HXT.XPath
-- | Value compared with each row's @OwnerUserId@ attribute in the XPath query.
userid :: String
userid = "83805"
-- | Print the @Id@ attribute of the first @row@ in @posts.xml@ whose
-- @OwnerUserId@ equals 'userid', or @<not present>@ when there is none.
main :: IO ()
main = do
  postIds <- runX (readDoc "posts.xml" >>> posts)
  -- Take the first *result* here. The previous @arr head@ was applied to each
  -- Id string, yielding its first character (and crashing on an empty string),
  -- and the outcome of 'runX' was never printed.
  case postIds of
    firstId : _ -> print firstId
    []          -> putStrLn "<not present>"
  where
    -- Reader options are unchanged from the original snippet.
    readDoc = readDocument [ (a_tagsoup, v_1)
                           , (a_parse_xml, v_1)
                           , (a_remove_whitespace, v_1)
                           , (a_issue_warnings, v_0)
                           , (a_trace, v_1)
                           ]

    -- Id attribute values of the rows selected by the XPath query.
    posts :: ArrowXml a => a XmlTree String
    posts = getXPathTrees byUserId >>> getAttrValue "Id"
      where
        -- A predicate ([...]) selects the matching row elements. The earlier
        -- form ".../@OwnerUserId='...'" is a boolean comparison in XPath, not
        -- a node selection, so it never produced the rows themselves.
        byUserId = "/posts/row[@OwnerUserId='" ++ userid ++ "']"
xml
import Control.Monad
import Control.Monad.Error
import Control.Monad.Trans.Maybe
import Data.Either
import Data.Maybe
import Text.XML.Light
-- | Owner id passed to 'byUser' when searching the posts.
userid :: String
userid = "83805"
-- | Show the first element under the top-level @posts@ element of
-- @posts.xml@ whose @OwnerUserId@ equals 'userid', or print
-- @<not present>@ when no element matches.
main :: IO ()
main = do
  -- Only posts.xml is read. The earlier version also opened votes.xml with
  -- lazy 'readFile' but never used its contents, which left the handle open
  -- and made the program fail when that unrelated file was missing.
  posts <- liftM parseXML (readFile "posts.xml")
  let ps = elemNamed "posts" posts
  putStrLn $ maybe "<not present>" show
           $ filterElement (byUser userid) ps
-- | Return the first element in the given content list whose unqualified
-- name equals @name@. Only the elements of the list itself are inspected
-- (via 'onlyElems'); nested children are not searched.
--
-- Calls 'error' with a descriptive message when no such element exists,
-- instead of failing with an anonymous empty-list error from 'head'.
elemNamed :: String -> [Content] -> Element
elemNamed name contents =
  case [el | el <- onlyElems contents, qName (elName el) == name] of
    el : _ -> el
    []     -> error ("elemNamed: no <" ++ name ++ "> element found")
-- | True when the element carries an unqualified @OwnerUserId@ attribute
-- whose value equals the given id; False when the attribute is missing or
-- holds a different value.
byUser :: String -> Element -> Bool
byUser uid el = findAttr ownerAttr el == Just uid
  where
    ownerAttr = QName "OwnerUserId" Nothing Nothing
Where did I go wrong? What is the proper way to process hefty XML documents with Haskell?