Skip to content

Instantly share code, notes, and snippets.

@ian-ross
Last active August 29, 2015 14:05
Show Gist options
  • Save ian-ross/ed3a21f6238719d4aaeb to your computer and use it in GitHub Desktop.
Save ian-ross/ed3a21f6238719d4aaeb to your computer and use it in GitHub Desktop.
Haskell data analysis: atmospheric non-diffusive flow data pre-processing
module Main where
import Prelude hiding (length, maximum, minimum, sum, map)
import Control.Applicative ((<$>))
import Control.Monad
import Data.List (foldl')
import qualified Data.Map as M
import Foreign.C
import qualified Data.Vector.Generic as GV
import qualified Data.Vector.Unboxed as VU
import Data.Array.Repa hiding ((++))
import Data.Array.Repa.Eval
import Data.Array.Repa.Repr.ForeignPtr (F)
import Data.Array.Repa.Repr.Unboxed (U)
import qualified Data.Vector.Storable as SV
import System.FilePath
import Data.IORef
import Data.NetCDF
import qualified Data.NetCDF.Vector as V
import qualified Data.NetCDF.Repa as R
import Debug.Trace
type SVRet a = IO (Either NcError (SV.Vector a))
type FArray3 a = Array F DIM3 a
type FArray2 a = Array F DIM2 a
type Array2 a = Array U DIM2 a
type RepaRet3 a = IO (Either NcError (FArray3 a))
type RepaRet2 a = IO (Either NcError (FArray2 a))
type RepaRet1 a = IO (Either NcError (Array F DIM1 a))
workdir :: FilePath
workdir = "/big/project-storage/haskell-data-analysis" </>
"atmos-non-diffusive/pre-processing"
main :: IO ()
main = do
-- Open subset data file for input.
Right innc <- openFile $ workdir </> "z500-subset.nc"
let Just nlat = ncDimLength <$> ncDim innc "lat"
Just nlon = ncDimLength <$> ncDim innc "lon"
Just ntime = ncDimLength <$> ncDim innc "time"
let (Just lonvar) = ncVar innc "lon"
(Just latvar) = ncVar innc "lat"
(Just timevar) = ncVar innc "time"
(Just z500var) = ncVar innc "z500"
Right lon <- get innc lonvar :: SVRet CFloat
Right lat <- get innc latvar :: SVRet CFloat
Right time <- get innc timevar :: SVRet CDouble
-- Each year has 181 days, so 72 x 15 x 181 = 195480 data values.
let ndays = 181
nyears = ntime `div` ndays
-- Use an Int array to accumulate values for calculating mean annual
-- cycle. Since the data is stored as short integer values, this is
-- enough to prevent overflow as we accumulate the 2014 - 1948 = 66
-- years of data.
let sh = Z :. ndays :. nlat :. nlon
slicecount = [ndays, nlat, nlon]
zs = take (product slicecount) (repeat 0)
init = fromList sh zs :: FArray3 Int
-- Computation to accumulate a single year's data.
let doone current y = do
-- Read one year's data.
let start = [(y - 1948) * ndays, 0, 0]
Right slice <- getA innc z500var start slicecount :: RepaRet3 CShort
return $! computeS $ current +^ (map fromIntegral slice)
-- Accumulate all data.
total <- foldM doone init [1948..2013]
-- Calculate the final mean annual cycle.
let mean = computeS $
map (fromIntegral . (`div` nyears)) total :: FArray3 CShort
-- Calculate 31-day running mean of mean annual cycle.
let delta = 15
smoothdays = ndays - 2 * delta
smoothmean = runmean mean delta
-- Set up output file.
let outlatdim = NcDim "lat" nlat False
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar)
outlondim = NcDim "lon" nlon False
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar)
outtimedim = NcDim "time" 0 True
outtimevar = NcVar "time" NcDouble [outtimedim] (ncVarAttrs timevar)
outdaydim = NcDim "day" smoothdays False
outdayvar = NcVar "day" NcInt [outdaydim] M.empty
outz500attrs = foldl' (flip M.delete) (ncVarAttrs z500var)
["scale_factor", "add_offset"]
outz500var = NcVar "z500" NcShort
[outtimedim, outlatdim, outlondim] outz500attrs
outmeanvar = NcVar "mean" NcShort [outdaydim, outlatdim, outlondim]
(ncVarAttrs z500var)
outncinfo =
emptyNcInfo (workdir </> "z500-anomalies.nc") #
addNcDim outlatdim # addNcDim outlondim #
addNcDim outtimedim # addNcDim outdaydim #
addNcVar outlatvar # addNcVar outlonvar #
addNcVar outtimevar # addNcVar outdayvar #
addNcVar outz500var # addNcVar outmeanvar
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $
\outnc -> do
-- Write fixed variable values.
put outnc outlatvar lat
put outnc outlonvar lon
put outnc outdayvar (SV.enumFromN 1 smoothdays :: SV.Vector CInt)
put outnc outmeanvar smoothmean
-- Process one input year at a time, accumulating all relevant
-- time-steps into a single output file.
itoutref <- newIORef 0 :: IO (IORef Int)
let count = [1, nlat, nlon]
forM_ [0..ntime-1] $ \it -> do
-- If "it" is within the 151 day winter block...
when (it `mod` 181 >= 15 && it `mod` 181 <= 165) $ do
-- Read time slice.
Right sli <- getA innc z500var [it, 0, 0] count :: RepaRet2 CShort
-- Calculate anomalies and write out.
let isl = it `mod` 181 - 15
sl = Z :. isl :. All :. All
anom = computeS $ sli -^ (slice smoothmean sl) :: FArray2 CShort
itout <- readIORef itoutref
putStrLn $ show it ++ " -> " ++ show itout
modifyIORef itoutref (+1)
putA outnc outtimevar [itout] [1] (SV.slice it 1 time)
putA outnc outz500var [itout, 0, 0] count anom
return ()
-- Calculate running mean of 3-dimensional array stored in (time, y,
-- x) coordinate order, from time positions (i - d) to (i + d) for
-- each valid time index i.
runmean :: FArray3 CShort -> Int -> FArray3 CShort
runmean x d = computeS $ traverse x (const smshape) doone
where (Z :. nz :. ny :. nx) = extent x
-- ^ Extent of input array.
smshape = Z :. nz - 2 * d :. ny :. nx
-- ^ Extent of smoothed output array: spatial dimensions are
-- the same as the input, and just the time dimension is
-- reduced by the averaging.
sz = Z :. (2 * d + 1) :. 1 :. 1
-- ^ Slice extent for averaging.
len = fromIntegral (2 * d + 1) :: Double
-- ^ Averaging length.
dem x = fromIntegral (truncate x :: Int)
-- ^ Type demotion from Double to CShort for output.
pro :: Array U DIM3 Double
pro = computeS $ map (\x -> fromIntegral (fromIntegral x :: Int)) x
-- ^ Type promotion of input from CShort to Double.
doone _ i = dem $ (sumAllS (extract i sz pro)) / len
-- ^ Calculate a single averaged value.
module Main where
import Prelude hiding (length, maximum, minimum, sum)
import Control.Applicative ((<$>))
import Control.Monad
import Data.List (foldl')
import qualified Data.Map as M
import Foreign.C
import Data.Vector.Generic hiding ((++), map, forM_, foldl')
import qualified Data.Array.Repa as Repa
import Data.Array.Repa.Repr.ForeignPtr (F)
import Data.Array.Repa.Repr.Unboxed (U)
import qualified Data.Vector.Storable as SV
import System.FilePath
import Data.IORef
import Data.NetCDF
import qualified Data.NetCDF.Vector as V
import qualified Data.NetCDF.Repa as R
type SVRet a = IO (Either NcError (SV.Vector a))
type FArray2 a = Repa.Array F Repa.DIM2 a
type Array2 a = Repa.Array U Repa.DIM2 a
type RepaRet2 a = IO (Either NcError (FArray2 a))
type RepaRet1 a = IO (Either NcError (Repa.Array F Repa.DIM1 a))
indir :: FilePath
indir = "/big/data/reanalysis/NCEP/pressure/hgt"
outdir :: FilePath
outdir = "/big/project-storage/haskell-data-analysis" </>
"atmos-non-diffusive/pre-processing"
main :: IO ()
main = do
-- Open first annual file to get metadata necessary for setting up
-- output file and for subsetting index calculations.
Right refnc <- openFile $ indir </> "hgt.1948.nc"
let Just nlat = ncDimLength <$> ncDim refnc "lat"
Just nlon = ncDimLength <$> ncDim refnc "lon"
Just nlev = ncDimLength <$> ncDim refnc "level"
let (Just lonvar) = ncVar refnc "lon"
(Just latvar) = ncVar refnc "lat"
(Just levvar) = ncVar refnc "level"
(Just timevar) = ncVar refnc "time"
(Just zvar) = ncVar refnc "hgt"
Right lon <- get refnc lonvar :: SVRet CFloat
Right lat <- get refnc latvar :: SVRet CFloat
Right lev <- get refnc levvar :: SVRet CFloat
-- Determine spatial subsetting index bounds.
let late = vectorIndex LT FromEnd lat 17.5
lats = vectorIndex GT FromStart lat 87.5
levi = vectorIndex GT FromStart lev 500.0
-- Determine appropriate time indexes for winter days: "winter" in
-- this context is 181 days long and starts on 1 November each year.
-- This means that we get the first 120 days of each year plus the
-- days from 1 November to the end of the year.
let inov1non = 305 - 1
-- ^ Index of 1 November in non-leap years.
wintertsnon = [0..119] ++ [inov1non..365-1]
-- ^ Indexes of all winter days for non-leap years.
inov1leap = 305 + 1 - 1
-- ^ Index of 1 November in leap years.
wintertsleap = [0..119] ++ [inov1leap..366-1]
-- ^ Indexes of all winter days for leap years.
winterts1948 = [inov1leap..366-1]
winterts2014 = [0..119]
-- ^ Indexes for winters in start and end years.
-- Set up output file.
let outlat = SV.fromList $ map (lat SV.!) [lats,lats+2..late]
outlon = SV.fromList $ map (lon SV.!) [0,2..nlon-1]
noutlat = SV.length outlat
noutlon = SV.length outlon
outlatdim = NcDim "lat" noutlat False
outlatvar = NcVar "lat" NcFloat [outlatdim] (ncVarAttrs latvar)
outlondim = NcDim "lon" noutlon False
outlonvar = NcVar "lon" NcFloat [outlondim] (ncVarAttrs lonvar)
outtimedim = NcDim "time" 0 True
outtimeattrs = foldl' (flip M.delete) (ncVarAttrs timevar)
["actual_range"]
outtimevar = NcVar "time" NcDouble [outtimedim] outtimeattrs
outz500attrs = foldl' (flip M.delete) (ncVarAttrs zvar)
["actual_range", "level_desc", "valid_range"]
outz500var = NcVar "z500" NcShort
[outtimedim, outlatdim, outlondim] outz500attrs
outncinfo =
emptyNcInfo (outdir </> "z500-subset.nc") #
addNcDim outlatdim # addNcDim outlondim # addNcDim outtimedim #
addNcVar outlatvar # addNcVar outlonvar # addNcVar outtimevar #
addNcVar outz500var
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $
\outnc -> do
-- Write coordinate variable values.
put outnc outlatvar outlat
put outnc outlonvar outlon
-- Set up output time step index.
outidxref <- newIORef 0
-- Process one input year at a time, accumulating all relevant
-- time-steps into a single output file.
forM_ [1948..2014] $ \y -> do
-- Open annual file.
syncFile outnc
putStrLn $ "Processing: " ++ show y
Right nc <- openFile $ indir </> ("hgt." ++ show y) <.> "nc"
-- Read metadata.
let Just ntime = ncDimLength <$> ncDim nc "time"
let (Just timevar) = ncVar nc "time"
(Just zvar) = ncVar nc "hgt"
Right time <- get nc timevar :: SVRet CDouble
-- Determine appropriate time indexes for winter days:
-- "winter" in this context is 181 days long and starts on 1
-- November each year. This means that we get the first 120
-- days of each year plus the days from 1 November to the end
-- of the year.
let leap = y `mod` 400 == 0 || (y `mod` 100 /= 0 && y `mod` 4 == 0)
-- ^ Is this a leap year?
winterts = case y of
1948 -> winterts1948
2014 -> winterts2014
_ -> if leap then wintertsleap else wintertsnon
-- ^ Indexes of all winter days.
-- Process one time-step at a time.
forM_ winterts $ \it -> do
-- Read a slice of a single time-step of data: Northern
-- Hemisphere (17.5-87.5 degrees), 5 degree resolution, 500
-- mb level only.
let start = [it, levi, lats, 0]
count = [1, 1, (late - lats) `div` 2 + 1, nlon `div` 2]
stride = [1, 1, 2, 2]
Right slice <- getS nc zvar start count stride :: RepaRet2 CShort
-- Write the time and Z500 data out.
outidx <- readIORef outidxref
modifyIORef outidxref (+1)
putA outnc outtimevar [outidx] [1] (SV.slice it 1 time)
putA outnc outz500var [outidx, 0, 0] [1, noutlat, noutlon] slice
data IndexStart = FromStart | FromEnd
vectorIndex :: (SV.Storable a, Ord a)
=> Ordering -> IndexStart -> SV.Vector a -> a -> Int
vectorIndex o s v val = case (go o, s) of
(Nothing, _) -> (-1)
(Just i, FromStart) -> i
(Just i, FromEnd) -> SV.length v - 1 - i
where go LT = SV.findIndex (>= val) vord
go GT = SV.findIndex (<= val) vord
vord = case s of
FromStart -> v
FromEnd -> SV.reverse v
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment