module Main where -- simple (and slow) implementation of Good-Turing smoothing -- with neighborhood smoothing and binning import Control.Monad import Data.FiniteMap import Data.List import Data.Maybe -- calculate unigram and bigram tables type UG = FiniteMap String Int type BG = FiniteMap (String,String) Int createTables :: [[String]] -> (UG, BG) createTables sents = (foldr buildTable emptyFM sents', foldr buildTable emptyFM (map (\s -> zip s (tail s)) sents')) where sents' = map (([""]++) . (++[""])) sents buildTable l t = foldr (\w t -> addToFM_C (+) t w 1) t l -- calcuate N_r table calculateN_r :: BG -> FiniteMap Int Int calculateN_r = foldFM (\_ r fm -> addToFM_C (+) fm r 1) emptyFM -- smooth the N_r table using binning and renormalize smoothN_r :: FiniteMap Int Int -> FiniteMap Int Double smoothN_r n_r = normalize (foldr addSmoothed emptyFM [low..high]) where low = foldFM (\r _ -> min r) 10000 n_r high = foldFM (\r _ -> max r) 0 n_r + 1 oldSum = fromIntegral $ foldFM (const (+)) 0 n_r smooth i = let fm_l = reverse $ eltsFM $ filterFM (\r _ -> r < i) n_r fm_g = eltsFM $ filterFM (\r _ -> r > i) n_r in average (take 1 fm_l ++ take 1 fm_g ++ maybeToList (lookupFM n_r i)) addSmoothed i fm = addToFM fm i (smooth i) normalize fm = mapFM (const (*s)) fm where s = oldSum / foldFM (const (+)) 0 fm -- compute r* given N, N_1 and the smoothed n_r table compute_rstar :: Int -> Int -> FiniteMap Int Double -> FiniteMap Int Double compute_rstar n n_1 n_r_table = removeMax $ addToFM (mapFM calc_rstar n_r_table) 0 r0 where r0 = fromIntegral n_1 / fromIntegral n calc_rstar r n_r = (fromIntegral r + 1) * lookupWithDefaultFM n_r_table 0 (r+1) / n_r removeMax fm = filterFM (\e _ -> e < mx) fm where mx = foldFM (\e _ -> max e) 0 fm -- do the binning; we strive to have k-many bins with equal -- numbers of elements computeBins :: Int -> UG -> BG -> [(UG,BG)] computeBins k ug bg = map mkUGBG bins where l = sortBy (\a b -> snd a `compare` snd b) [((x,y), p_x * p_y) | (x,y) <- keysFM bg , let p_x = lookupWithDefaultFM ug 0 x , let p_y = lookupWithDefaultFM ug 0 y ] n = ceiling (genericLength l / fromIntegral k) bins = splitOff n l mkUGBG bin = (listToFM $ concatMap (\ ((w1,w2),_) -> [(w1, lookupWithDefaultFM ug 0 w1), (w2, lookupWithDefaultFM ug 0 w2)]) bin, listToFM $ map (\ (b,_) -> (b, lookupWithDefaultFM bg 0 b)) bin) -- write an fm to a file writeData :: FiniteMap Int Double -> FilePath -> IO () writeData n_r_table fp = writeFile fp $ unlines $ map (\ (i,d) -> show i ++ " " ++ show d) $ fmToList n_r_table -- helper functions average x = fromIntegral (sum x) / genericLength x splitOff n [] = [] splitOff n xl = take n xl : splitOff n (drop n xl) -- example usage: main = do d <- (map words . lines) `liftM` getContents let (ug,bg) = createTables d let n_r_table = calculateN_r bg let n_r_smooth = smoothN_r n_r_table let r_star = compute_rstar (sizeFM ug * sizeFM ug) (lookupWithDefaultFM n_r_table 0 1) n_r_smooth -- show the basic n_r curve and show the basic r* table putStrLn "Writing r-Nr and r_star" writeData n_r_smooth "r-Nr" writeData r_star "r_star" -- compute bins let bins = computeBins 21 ug bg -- only look at first, last, and middle let ((firstU,firstB), (middleU,middleB), (lastU,lastB)) = (bins !! 0, bins !! 10, bins !! 20) -- for each bin, compute n_r_smooth and r_star let n_r_table_1 = calculateN_r firstB let n_r_smooth_1 = smoothN_r n_r_table_1 let r_star_1 = compute_rstar (sizeFM firstU * sizeFM firstU) (lookupWithDefaultFM n_r_table_1 0 1) n_r_smooth_1 putStrLn "Writing first.r-Nr and first.r_star" writeData n_r_smooth_1 "first.r-Nr" writeData r_star_1 "first.r_star" let n_r_table_2 = calculateN_r middleB let n_r_smooth_2 = smoothN_r n_r_table_2 let r_star_2 = compute_rstar (sizeFM middleU * sizeFM middleU) (lookupWithDefaultFM n_r_table_2 0 1) n_r_smooth_2 putStrLn "Writing middle.r-Nr and middle.r_star" writeData n_r_smooth_2 "middle.r-Nr" writeData r_star_2 "middle.r_star" let n_r_table_3 = calculateN_r lastB let n_r_smooth_3 = smoothN_r n_r_table_3 let r_star_3 = compute_rstar (sizeFM lastU * sizeFM lastU) (lookupWithDefaultFM n_r_table_3 0 1) n_r_smooth_3 putStrLn "Writing last.r-Nr and last.r_star" writeData n_r_smooth_3 "last.r-Nr" writeData r_star_3 "last.r_star"