---------------------------------------------------------------------- -- | -- Module : RegExp -- Maintainer : Markus Forsberg -- Stability : (stability) -- Portability : (portability) -- -- -- A module with regular expressions containing variables. -- Interfaces to the Text.Regex module. ------------------------------------------------------------------------- ---- module RegExp ( Variable, RegExp, conc, or_r, star, plus, str, var, str_class, char, negated, letter, digit, upper, lower, Reg, matchReg, -- :: Reg -> String -> Maybe [(Variable, String)] matchRegScan, -- :: Reg -> String -> Maybe ([(Variable, String)],String) separate, -- :: Reg -> String -> String createReg, -- :: RegExp -> [(Variable,RegExp)] -> Reg variables, -- :: Reg -> [Variable] toRegex, -- :: RegExp -> [(Variable,RegExp)] -> (String,[Variable]) generateString ) where -- NOTE: Variables should only be used in concatenations and in very -- restricted settings of disjunction. -- x | y will yield an answer, where x is associated to the input string -- and y is associated to the empty string. The way to understand this -- output is that the variable binding actually refers to a sub-expression, -- and tells how much of the input string a particular sub-expression actually -- matched. I.e. it is not proper variable binding, and a front-end is -- needed to check the semantics. -- Disjunction that is ok: x+"a"|x+"b" and "a"+x+"b"|"c"+x+"d" import Text.Regex import System.IO.Unsafe tr s = unsafePerformIO $ do putStrLn s return s type Variable = String {- | The type of regular expressions. -} data RegExp = Conc RegExp RegExp | Uni RegExp RegExp | Star RegExp | Plus RegExp | S String | V Variable | Negated String | Digit | Letter | Upper | Lower | Any deriving(Show,Eq) conc :: RegExp -> RegExp -> RegExp conc r1 r2 = Conc r1 r2 or_r :: RegExp -> RegExp -> RegExp or_r r1 r2 = Uni r1 r2 star :: RegExp -> RegExp star = Star plus :: RegExp -> RegExp plus = Plus str :: [Char] -> RegExp str = S negated :: String -> RegExp negated = Negated str_class :: String -> RegExp str_class [] = str [] str_class s = foldr1 or_r $ map (str . (:[])) s var :: Variable -> RegExp var = V char :: RegExp char = Any lower :: RegExp lower = Lower upper :: RegExp upper = Upper digit :: RegExp digit = Digit letter :: RegExp letter = Letter {- | A regular expression is a Regex augmented with a set of variables. -} type Reg = (Regex,[(Variable,Int)]) {- | Repeated variables is interpreted as back references. I.e. repeated variables unifies. -} type BackReference = (Int,[(Variable,Int)]) {- | Precedence -} type Prec = Int {- | The variables of a Reg. -} variables :: Reg -> [Variable] variables (_,xs) = map fst xs {- | Match regular expression Reg with string String. If a successful match is performed, variables are bind to the output. -} matchReg :: Reg -> String -> Maybe [(Variable, String)] matchReg (r,vars) s = case matchRegexAll r s of Just ([],_,[],xs) -> Just $ extract_result vars xs _ -> Nothing matchRegScan :: Reg -> String -> Maybe ([(Variable, String)],String,String) matchRegScan (r,vars) s = case matchRegexAll r s of Just ([],matched,rest,xs) -> Just $ ((extract_result vars xs),matched,rest) _ -> Nothing replace :: Reg -> String -> String -> String replace (r,_) s rep = subRegex r s rep separate :: Reg -> String -> [String] separate (r,_) s = splitRegex r s extract_result :: [(String,Int)] -> [String] -> [(String,String)] extract_result [] _ = [] extract_result ((v,i):xs) res = (v,res!!(i-1)):extract_result xs res {- | All variables may be associated with a RegExp. If no such association exists, then it defaults to Kleene's star over any symbol. -} createReg :: RegExp -> [(Variable,RegExp)] -> Reg createReg r env = case toRegex r env of (s,xs) -> (mkRegex s,xs) {- | Translate a Regular expression with variables to a Regex string. -} toRegex :: RegExp -> [(Variable,RegExp)] -> (String,[(Variable,Int)]) toRegex r env = case f r (-1) (1,[]) of (s,(_,xs)) -> (s,reverse xs) where f :: RegExp -> Prec -> BackReference -> (String,BackReference) f r i vs@(n,vs') = case r of (Conc r1 r2) -> case f r1 1 vs of (s,xs) -> case f r2 1 xs of (s2,(n',vs)) -> (paren 1 i (concat [s,s2]),(n',vs)) (Uni r1 r2) -> case f r1 0 vs of (s,xs) -> case f r2 0 xs of (s2,(n',vs)) -> (paren 0 i (concat [s,"|",s2]),(n',vs)) (Star r1) -> case f r1 2 vs of (s,xs) -> (concat [s,"*"],xs) (Plus r1) -> case f r1 2 vs of (s,xs) -> (concat [s,"+"],xs) (S s) -> (whitespaces (process_string s),vs) (Negated s) -> ("[^" ++ (process_string s) ++ "]",vs) (Any) -> (".",vs) (Lower) -> ("[[:lower:]]",vs) (Upper) -> ("[[:upper:]]",vs) (Digit) -> ("[[:digit:]]",vs) (Letter) -> ("[[:lower:][:upper:]]",vs) (V var) -> case lookup var vs' of Just num -> ("\\" ++ show num,vs) Nothing -> case lookup var env of Nothing -> case f (Star Any) 0 vs of (s,(n',xs)) -> (concat ["(",s,")"],(n'+1,(var,n'):xs)) Just r -> case f r 0 vs of (s,(n',xs)) -> (concat ["(",s,")"],(n'+1,(var,n'):xs)) count :: Prec -> Prec -> Int -> Int count n1 n2 i | n1 >= n2 = i | otherwise = i+1 paren :: Prec -> Prec -> String -> String paren n1 n2 s | n1 >= n2 = s | otherwise = concat ["(", s ,")"] process_string :: String -> String process_string [] = "()" process_string xs = process xs where process [] = [] process (c:cs) = case c of '?' -> "\\?" ++ process cs '.' -> "\\." ++ process cs '^' -> "\\^" ++ process cs '$' -> "\\$" ++ process cs '+' -> "\\+" ++ process cs '*' -> "\\*" ++ process cs '{' -> "\\{" ++ process cs '\\' -> "\\\\" ++ process cs '|' -> "\\|" ++ process cs '[' -> "\\[" ++ process cs ']' -> "\\]" ++ process cs '(' -> "\\(" ++ process cs ')' -> "\\)" ++ process cs _ -> c:process cs whitespaces :: String -> String whitespaces = id {- whitespaces [] = [] whitespaces (c:cs) = case c of ' ' -> "[ ]" ++ whitespaces cs '\t' -> "[\t]" ++ whitespaces cs '\n' -> "[\n]" ++ whitespaces cs _ -> c:whitespaces cs -} ex = plus (or_r (str "a") (or_r (str "b") (str "c"))) generateString :: RegExp -> [(String,String)] -> String generateString r table = case r of (Conc r1 r2) -> generateString r1 table ++ generateString r2 table (S s) -> s (V var) -> case lookup var table of Just s -> s Nothing -> error $ "Internal error: found unknown variable \"" ++ var ++ "\" while generating string."