-- | -- Module : HsIndex.Parser -- Copyright : Jean-Luc JOULIN 2018-2020 -- License : General Public Licence (GPLv3) -- Maintainer : Jean-Luc JOULIN -- Stability : alpha -- Portability : portable -- The parsing functions for the hsindex program. module HsIndex.Parser ( -- * Parsing the index file -- * Parsing the style file -- * Parsing the custom language definition file -- -- ** Description of a custom file -- -- A custom language can be defined with a specific file. -- This file must respect the following format and contain in any order : -- -- 1. A mandatory ordered list of letters describing the alphabet. -- -- > LETTERS -- > abcdefghijkl -- -- 2. An optional ordered list of numbers. -- -- > NUMBERS -- > 0123456789 -- -- 3. An optional ordered list of symbols. -- -- > SYMBOLS -- > 0123456789 -- -- 4. An optional substitution list. This list describe the character substitutions -- to perform before sorting the words. -- -- > SUBSTITUTIONS -- > œ->oe -- > à->a -- > ê->e -- -- ** example of custom file -- -- styleBasic , styleDoubleHeading , parseStyleFile , parseLanguageFile , parseIndexFile , emptyDef ) where import Data.Char import Data.Functor.Identity import Data.Functor import HsIndex.CharLists.French import HsIndex.CharLists.Russian import HsIndex.CharLists.SubsIEC import HsIndex.CharLists.Symbols import HsIndex.Functions ( literalNewLine -- ~ , upperLower -- ~ , replaUpperLower , sepArobase ) import HsIndex.Types import qualified Data.Text as T import Text.Parsec import Text.Parsec.Perm import Text.Parsec.Text -- | The default 'IndexStyle' applied to an index. -- -- This default style have : -- -- * Basic preamble and postamble -- -- * Uppercase Layer 0 heading -- -- * No Layer 1 heading -- styleBasic = IndexStyle { idxPreamble = T.pack "\\begin{theindex}\n" , idxPostamble = T.pack "\n\n\\end{theindex}\n" , idxHeadingFlag0 = UpperCase , idxHeadingFlag1 = None , idxHeadingPreL0 = T.pack "{\\vspace{1.5cm}\\huge{\\textbf{" , idxHeadingSufL0 = T.pack "}}\\hfill}\\nopagebreak\n" , idxHeadingPreL1 = T.pack "" -- TODO Remplacer par empty , idxHeadingSufL1 = T.pack "" , idxSymhead = T.pack "Symbols" , idxNumhead = T.pack "Numbers" , idxGroupSkip0 = T.pack "\n \\indexspace\n" -- "\n\n \\indexspace\n" , idxGroupSkip1 = T.pack "" , idxItem0 = T.pack "\n \\item " , idxItem1 = T.pack "\n \\subitem " , idxItem2 = T.pack "\n \\subsubitem " , idxItem01 = T.pack "\n \\subitem " , idxItem12 = T.pack "\n \\subsubitem " , idxDelim0 = T.pack ", " -- \\hfill , idxDelim1 = T.pack ", " , idxDelim2 = T.pack ", " , idxDelimn = T.pack ", " , idxDelimr = T.pack "--" , idxEncapPre = T.pack "{" , idxEncapSuf = T.pack "}" } -- | Another 'IndexStyle' applied to an index. -- -- This default style have : -- -- * Basic preamble and postamble -- -- * Uppercase Layer 0 headings -- -- * Uppercase Layer 1 headings -- styleDoubleHeading = IndexStyle { idxPreamble = T.pack "\\begin{theindex}\n" , idxPostamble = T.pack "\n\n\\end{theindex}\n" , idxHeadingFlag0 = UpperCase , idxHeadingFlag1 = UpperCase , idxHeadingPreL0 = T.pack "{\\vspace{1.5cm}\\huge{\\textbf{" , idxHeadingSufL0 = T.pack "}}\\hfill}\\nopagebreak\n\n" , idxHeadingPreL1 = T.pack "\n{\\vspace{0.5cm}\\large{\\textbf{" , idxHeadingSufL1 = T.pack "}}\\hfill}\\nopagebreak" , idxSymhead = T.pack "Symbols" , idxNumhead = T.pack "Numbers" , idxGroupSkip0 = T.pack "\n\n \\indexspace\n" , idxGroupSkip1 = T.pack "\n\n \\indexspace\n" , idxItem0 = T.pack "\n \\item " , idxItem1 = T.pack "\n \\subitem " , idxItem2 = T.pack "\n \\subsubitem " , idxItem01 = T.pack "\n \\subitem " , idxItem12 = T.pack "\n \\subsubitem " , idxDelim0 = T.pack ", " -- \\hfill , idxDelim1 = T.pack ", " , idxDelim2 = T.pack ", " , idxDelimn = T.pack ", " , idxDelimr = T.pack "--" , idxEncapPre = T.pack "{" , idxEncapSuf = T.pack "}" } -- | Parse a style file -- -- A style file can contain several optional keywords definition to set the design -- of an index. -- -- Keywords can be : -- -- [preamble] To set the beginning of the index. -- -- -- [postamble] To set the end of the index. -- parseStyleFile :: IndexStyle -- ^ The default 'Style' to use. -> ParsecT String () Identity IndexStyle -- ^ The new 'Style' parsed. parseStyleFile sty = do emptyLines -- Possibles emptylines at the beginning of the file sty <- permute ( IndexStyle -- all possible permutations <$?> (idxPreamble sty , try $ parseStyleDef "preamble") -- Parse the preamble <|?> (idxPostamble sty , try $ parseStyleDef "postamble") <|?> (idxHeadingFlag0 sty, try $ parseStyleDefHead "headings_flag") <|?> (idxHeadingFlag1 sty, try $ parseStyleDefHead "headings_flag1") <|?> (idxHeadingPreL0 sty, try $ parseStyleDef "heading_prefix") <|?> (idxHeadingSufL0 sty, try $ parseStyleDef "heading_suffix") <|?> (idxHeadingPreL1 sty, try $ parseStyleDef "heading_prefix1") <|?> (idxHeadingSufL1 sty, try $ parseStyleDef "heading_suffix1") <|?> (idxSymhead sty , try $ parseStyleDef "symhead_positive") <|?> (idxNumhead sty , try $ parseStyleDef "numhead_positive") <|?> (idxGroupSkip0 sty , try $ parseStyleDef "group_skip") <|?> (idxGroupSkip1 sty , try $ parseStyleDef "group_skip1") <|?> (idxItem0 sty , try $ parseStyleDef "item_0") <|?> (idxItem1 sty , try $ parseStyleDef "item_1") <|?> (idxItem2 sty , try $ parseStyleDef "item_2") <|?> (idxItem01 sty , try $ parseStyleDef "item_01") <|?> (idxItem12 sty , try $ parseStyleDef "item_12") <|?> (idxDelim0 sty , try $ parseStyleDef "delim_0") <|?> (idxDelim1 sty , try $ parseStyleDef "delim_1") <|?> (idxDelim2 sty , try $ parseStyleDef "delim_2") <|?> (idxDelimn sty , try $ parseStyleDef "delim_n") <|?> (idxDelimr sty , try $ parseStyleDef "delim_r") <|?> (idxEncapPre sty , try $ parseStyleDef "encap_infix") <|?> (idxEncapSuf sty , try $ parseStyleDef "encap_suffix") ) eof -- the end of file return sty -- | Parse many empty lines. emptyLines = many emptyLine -- | Parse an empty line. emptyLine = do many (oneOf " \t") -- possibly some spaces and tabulations. endOfLineP -- The end of line -- | Parse a style definition -- -- > item_0 "my style definition" -- parseStyleDef :: String -- ^ The name of the style -> ParsecT String () Identity T.Text -- ^ The definition of the style. parseStyleDef str = do string str many1 (char ' ') def <- between (char '"') (char '"') (many1 $ noneOf "\r\n\t\"") many (char ' ') endOfLineP emptyLines return (literalNewLine $ T.pack def) parseStyleDefHead str = try (parseStyleDefHeadNum str) <|> try (parseStyleDefHeadNone str) <|> try (parseStyleDefHeadUpper str) <|> (parseStyleDefHeadLower str) parseStyleDefHeadNone :: String -> ParsecT String () Identity Heading parseStyleDefHeadNone str = do string str many1 (char ' ') s <- string "None" many (char ' ') endOfLineP emptyLines return None parseStyleDefHeadUpper :: String -> ParsecT String () Identity Heading parseStyleDefHeadUpper str = do string str many1 (char ' ') s <- string "Upper" many (char ' ') endOfLineP emptyLines return UpperCase parseStyleDefHeadLower :: String -> ParsecT String () Identity Heading parseStyleDefHeadLower str = do string str many1 (char ' ') s <- string "Lower" many (char ' ') endOfLineP emptyLines return LowerCase parseStyleDefHeadNum :: String -> ParsecT String () Identity Heading parseStyleDefHeadNum str = do string str many1 (char ' ') s <- option ' ' (char '-') h <- many1 digit many (char ' ') endOfLineP emptyLines return (val2Heading (read (s : h))) val2Heading 0 = None val2Heading n = if n > 0 then UpperCase else LowerCase -- | Try to parse a IeC LaTeX substitution. -- Return the associated character if succeed. lstParseIeC lst = try $ do string "\\IeC" many (char ' ') choice $ map (\(s, r) -> (try $ do braces $ do many (char ' ') char '\\' string s many (char ' ') return r )) lst -- | Parse a number parseNumber :: ParsecT String () Identity Char parseNumber = try digit -- | Parse a symbol parseSymbol :: ParsecT String () Identity Char parseSymbol = try (oneOf allowedSymb) -- | Parse an hyphen character parseHyph :: ParsecT String () Identity Char parseHyph = try (oneOf lstHyph) parseAnything :: ParsecT String () Identity Char parseAnything = try (noneOf forbiddenSymb) braces = between (char '{') (char '}') braces' = between (char '{') (try $ do char '}';lookAhead (char '{' )) -- | Parse a single entry command from "imakeidx" LaTeX package. parseIDX :: ParsecT String () Identity Char -> ParsecT String () Identity IndexItem parseIDX pars = do string "\\indexentry" many (char ' ') ((itm,itmE),com) <- braces' (do itm <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) com <- option "" $ try $do char '|' optional (char '(') many (notFollowedBy (do char '}';char '{') >> parseAnything) return (itm,com) ) many (char ' ') n <- braces (many1 digit) return (IndexItem (T.pack itm) (Letters, T.pack itmE) (T.pack com) [read n] []) -- | Parse a entry command containing a subentry from "imakeidx" LaTeX package. parseIDXSub :: ParsecT String () Identity Char -> ParsecT String () Identity IndexItem parseIDXSub pars = do string "\\indexentry" many (char ' ') ((itm,itmE), (sub,subE),com) <- braces' (do itm <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) char '!' sub <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) -- ~ string "|hyperpage" com <- option "" $ try $do char '|' optional (char '(') many (notFollowedBy (do char '}';char '{') >> parseAnything) return (itm, sub,com) ) many (char ' ') n <- braces (many1 digit) return (IndexItem (T.pack itm) (Letters, T.pack itmE) T.empty [] [IndexSubItem (T.pack sub) (Letters, T.pack subE) (T.pack com) [read n] []]) -- | Parse a entry command containing a subsubentry from "imakeidx" LaTeX package. parseIDXSubSub :: ParsecT String () Identity Char -> ParsecT String () Identity IndexItem parseIDXSubSub pars = do string "\\indexentry" many (char ' ') ((itm,itmE), (sub,subE), (ssub,ssubE),com) <- braces' (do itm <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) char '!' sub <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) char '!' ssub <- sepArobase <$> many1 (notFollowedBy (do char '}';char '{') >> pars) -- ~ string "|hyperpage" com <- option "" $ try $ do char '|' optional (char '(') many (notFollowedBy (do char '}';char '{') >> parseAnything) return (itm, sub, ssub,com) ) many (char ' ') n <- braces (many1 digit) return (IndexItem (T.pack itm) (Letters, T.pack itmE) T.empty [] [IndexSubItem (T.pack sub) (Letters, T.pack subE) T.empty [] [IndexSubSubItem (T.pack ssub) (Letters, T.pack ssubE) (T.pack com) [read n]]]) -- | Parse all possible forms of entry from "imakeidx" LaTeX package. parseIndexItem pars = try (parseIDXSubSub pars) <|> try (parseIDXSub pars) <|> parseIDX pars parseIndexFile :: ParsecT String () Identity [IndexItem] parseIndexFile = do emptyLines itms <- endBy (parseIndexItem parseCharL) endOfLineP emptyLines eof return itms -- | Parse a end of line in both UNIX and WINDOWS format. -- ~ endOfLineP :: ParsecT String () Identity String endOfLineP = try (string "\n") -- Fin de ligne Unix/Linux (LF : Line feed) <|> try (string "\r\n") -- Fin de ligne Windows (CRLF : Carriage return Line feed) -- | Standard parser for chars. -- -- Try to parse : -- -- 1. The specific char output from the "imakeidx" LaTeX package. -- -- 2. Numbers -- -- 3. parseCharL :: ParsecT String () Identity Char parseCharL = lstParseIeC lstLaTeXSubs <|> parseAnything -- | Parse a file containing the lists of chars defining a language. -- parseLanguageFile :: ParsecT String PermState Identity LangDef parseLanguageFile = do emptyLines -- Possibles emptylines at the beginning of the file -- ~ putState emptyPermState def <- permute ( (\a b c d -> LangDef a b c d []) -- all possible permutations <$$> try (parseCharDefLetters ) -- <$$> <|?> ([], try $ parseCharDefNumbers ) <|?> (Nothing, try $ parseCharDefSymbols ) <|?> ([], try $ parseSubstitutions ) ) eof -- the end of file stat <- getState return def{lstSecOrder=order stat} -- | Parse a char list definition. -- -- A char list is defined by: -- -- * A title -- -- * A list of chars describing the sorting order of letters of this language. -- parseCharDefLetters = do string "LETTERS" endOfLineP chrs <- many (noneOf "\n\t") endOfLineP emptyLines modifyState (\st -> st{order=order st++[Letters]}) return ( chrs) parseCharDefNumbers = do string "NUMBERS" endOfLineP chrs <- many (noneOf "\n\t") endOfLineP emptyLines modifyState (\st -> st{order=order st++[Numbers]}) return ( chrs) -- | Parse a char list definition. -- -- A char list is defined by: -- -- * A title -- -- * A list of chars describing the sorting order of letters of this language. -- parseCharDefSymbols = do string "SYMBOLS" endOfLineP chrs <- many (noneOf "\n\t") endOfLineP emptyLines modifyState (\st -> st{order=order st++[Symbols]}) return (Just ( chrs)) -- | Parse a list of substitutions -- -- A substitution give an equivalent string to a char. A list of substitution is defined by : -- -- * A title -- -- * A list of substitutions : -- -- > œ->oe -- > à->a -- > ê->e -- -- note: The arrow -> musn't be preceded or followed by spaces. -- -- A special char can be substituted by a space with the following substitution. -- -- > _-> -- > --> -- parseSubstitutions = do string "SUBSTITUTIONS" endOfLineP repl <- many1 parseSubstitution emptyLines return repl -- (replaUpperLower repl) parseSubstitution = do cha <- noneOf "\r\n\t" string "->" str <- many1 (noneOf "\r\n\t") -- many (char ' ') endOfLineP return (cha, str) -- | The empty list of chars. emptyDef = LangDef [] [] Nothing [] []