data.lua

--- Reading and querying simple tabular data.
--
--    data.read 'test.txt'
--    ==> {{10,20},{2,5},{40,50},fieldnames={'x','y'},delim=','}
--
-- Provides a way of creating basic SQL-like queries.
--
--    require 'pl'
--    local d = data.read('xyz.txt')
--    local q = d:select('x,y,z where x > 3 and z < 2 sort by y')
--    for x,y,z in q do
--        print(x,y,z)
--    end
--
-- See the Guide
--
-- Dependencies: pl.utils, pl.array2d (fallback methods)
-- @module pl.data

local utils = require 'pl.utils'
local _DEBUG = rawget(_G,'_DEBUG')

local patterns,function_arg,usplit,array_tostring = utils.patterns,utils.function_arg,utils.split,utils.array_tostring
local append,concat = table.insert,table.concat
local gsub = string.gsub
local io = io
local _G,print,type,tonumber,ipairs,setmetatable,pcall,error = _G,print,type,tonumber,ipairs,setmetatable,pcall,error


local data = {}

local parse_select

local function count(s,chr)
    chr = utils.escape(chr)
    local _,cnt = s:gsub(chr,' ')
    return cnt
end

local function rstrip(s)
    return (s:gsub('%s+$',''))
end

local function strip (s)
    return (rstrip(s):gsub('^%s*',''))
end

-- this gives l the standard List metatable, so that if you
-- do choose to pull in pl.List, you can use its methods on such lists.
local function make_list(l)
    return setmetatable(l,utils.stdmt.List)
end

local function map(fun,t)
    local res = {}
    for i = 1,#t do
        res[i] = fun(t[i])
    end
    return res
end

local function split(line,delim,csv,n)
    local massage
    -- CSV fields may be double-quoted and may contain commas!
    if csv and line:match '"' then
        line = line:gsub('"([^"]+)"',function(str)
            local s,cnt = str:gsub(',','\001')
            if cnt > 0 then massage = true end
            return s
        end)
        if massage then
            massage = function(s) return (s:gsub('\001',',')) end
        end
    end
    local res = (usplit(line,delim,false,n))
    if csv then
        -- restore CSV commas-in-fields
        if massage then res = map(massage,res) end
        -- in CSV mode trailiing commas are significant!
        if line:match ',$' then append(res,'') end
    end
    return make_list(res)
end

local function find(t,v)
    for i = 1,#t do
        if v == t[i] then return i end
    end
end

local DataMT = {
    column_by_name = function(self,name)
        if type(name) == 'number' then
            name = '$'..name
        end
        local arr = {}
        for res in data.query(self,name) do
            append(arr,res)
        end
        return make_list(arr)
    end,

    copy_select = function(self,condn)
        condn = parse_select(condn,self)
        local iter = data.query(self,condn)
        local res = {}
        local row = make_list{iter()}
        while #row > 0 do
            append(res,row)
            row = make_list{iter()}
        end
        res.delim = self.delim
        return data.new(res,split(condn.fields,','))
    end,

    column_names = function(self)
        return self.fieldnames
    end,
}

local array2d

DataMT.__index = function(self,name)
    local f = DataMT[name]
    if f then return f end
    if not array2d then
        array2d = require 'pl.array2d'
    end
    return array2d[name]
end

--- return a particular column as a list of values (method).
-- @param name either name of column, or numerical index.
-- @function Data.column_by_name

--- return a query iterator on this data (method).
-- @string condn the query expression
-- @function Data.select
-- @see data.query

--- return a row iterator on this data (method).
-- @string condn the query expression
-- @function Data.select_row

--- return a new data object based on this query (method).
-- @string condn the query expression
-- @function Data.copy_select

--- return the field names of this data object (method).
-- @function Data.column_names

--- write out a row (method).
-- @param f file-like object
-- @function Data.write_row

--- write data out to file (method).
-- @param f file-like object
-- @function Data.write


-- [guessing delimiter] We check for comma, tab and spaces in that order.
-- [issue] any other delimiters to be checked?
local delims = {',','\t',' ',';'}

local function guess_delim (line)
    if line=='' then return ' ' end
    for _,delim in ipairs(delims) do
        if count(line,delim) > 0 then
            return delim == ' ' and '%s+' or delim
        end
    end
    return ' '
end

-- [file parameter] If it's a string, we try open as a filename. If nil, then
-- either stdin or stdout depending on the mode. Otherwise, check if this is
-- a file-like object (implements read or write depending)
local function open_file (f,mode)
    local opened, err
    local reading = mode == 'r'
    if type(f) == 'string' then
        if f == 'stdin'  then
            f = io.stdin
        elseif f == 'stdout'  then
            f = io.stdout
        else
            f,err = io.open(f,mode)
            if not f then return nil,err end
            opened = true
        end
    end
    if f and ((reading and not f.read) or (not reading and not f.write)) then
        return nil, "not a file-like object"
    end
    return f,nil,opened
end

local function all_n ()

end

--- read a delimited file in a Lua table.
-- By default, attempts to treat first line as separated list of fieldnames.
-- @param file a filename or a file-like object (default stdin)
-- @tab cnfg options table: can override delim (a string pattern), fieldnames (a list),
-- specify no_convert (default is to conversion), numfields (indices of columns known
-- to be numbers) and thousands_dot (thousands separator in Excel CSV is '.').
-- If csv is set then fields may be double-quoted and contain commas;
-- @return data object, or nil
-- @return error message. May be a file error, 'not a file-like object'
-- or a conversion error
function data.read(file,cnfg)
    local err,opened,count,line,csv
    local D = {}
    if not cnfg then cnfg = {} end
    local f,err,opened = open_file(file,'r')
    if not f then return nil, err end
    local thousands_dot = cnfg.thousands_dot

    -- note that using dot as the thousands separator (@thousands_dot)
    -- requires a special conversion function!
    local tonumber = tonumber
    local function try_number(x)
        if thousands_dot then x = x:gsub('%.(...)','%1') end
        local v = tonumber(x)
        if v == nil then return nil,"not a number" end
        return v
    end

    csv = cnfg.csv
    if csv then cnfg.delim = ',' end
    count = 1
    line = f:read()
    if not line then return nil, "empty file" end

    -- first question: what is the delimiter?
    D.delim = cnfg.delim and cnfg.delim or guess_delim(line)
    local delim = D.delim

    local conversion
    local numfields = {}
    local function append_conversion (idx,conv)
        conversion = conversion or {}
        append(numfields,idx)
        append(conversion,conv)
    end
    if cnfg.numfields then
        for _,n in ipairs(cnfg.numfields) do append_conversion(n,try_number) end
    end

    -- some space-delimited data starts with a space.  This should not be a column,
    -- although it certainly would be for comma-separated, etc.
    local stripper
    if delim == '%s+' and line:find(delim) == 1 then
        stripper = function(s)  return s:gsub('^%s+','') end
        line = stripper(line)
    end
    -- first line will usually be field names. Unless fieldnames are specified,
    -- we check if it contains purely numerical values for the case of reading
    -- plain data files.
    if not cnfg.fieldnames then
        local fields,nums
        fields = split(line,delim,csv)
        if not cnfg.convert then
            nums = map(tonumber,fields)
            if #nums == #fields then -- they're ALL numbers!
                append(D,nums) -- add the first converted row
                -- and specify conversions for subsequent rows
                for i = 1,#nums do append_conversion(i,try_number) end
            else -- we'll try to check numbers just now..
                nums = nil
            end
        else -- [explicit column conversions] (any deduced number conversions will be added)
            for idx,conv in pairs(cnfg.convert) do append_conversion(idx,conv) end
        end
        if nums == nil then
            cnfg.fieldnames = fields
        end
        line = f:read()
        count = count + 1
        if stripper then line = stripper(line) end
    elseif type(cnfg.fieldnames) == 'string' then
        cnfg.fieldnames = split(cnfg.fieldnames,delim,csv)
    end
    local nfields
    -- at this point, the column headers have been read in. If the first
    -- row consisted of numbers, it has already been added to the dataset.
    if cnfg.fieldnames then
        D.fieldnames = cnfg.fieldnames
        -- [collecting end field] If @last_field_collect then we'll
        -- only split as many fields as there are fieldnames
        if cnfg.last_field_collect then
            nfields = #D.fieldnames
        end
        -- [implicit column conversion] unless @no_convert, we need the numerical field indices
        -- of the first data row. These can also be specified explicitly by @numfields.
        if not cnfg.no_convert then
            local fields = split(line,D.delim,csv,nfields)
            for i = 1,#fields do
                if not find(numfields,i) and tonumber(fields[i]) then
                    append_conversion(i,try_number)
                end
            end
        end
    end
    -- keep going until finished
    while line do
        if not line:find ('^%s*$') then -- [blank lines] ignore them!
            if stripper then line = stripper(line) end
            local fields = split(line,delim,csv,nfields)
            if conversion then -- there were field conversions...
                for k = 1,#numfields do
                    local i,conv = numfields[k],conversion[k]
                    local val,err = conv(fields[i])
                    if val == nil then
                        return nil, err..": "..fields[i].." at line "..count
                    else
                        fields[i] = val
                    end
                end
            end
            append(D,fields)
        end
        line = f:read()
        count = count + 1
    end
    if opened then f:close() end
    if delim == '%s+' then D.delim = ' ' end
    if not D.fieldnames then D.fieldnames = {} end
    return data.new(D)
end

local function write_row (data,f,row,delim)
    data.temp = array_tostring(row,data.temp)
    f:write(concat(data.temp,delim),'\n')
end

function DataMT:write_row(f,row)
    write_row(self,f,row,self.delim)
end

--- write 2D data to a file.
-- Does not assume that the data has actually been
-- generated with new or read.
-- @param data 2D array
-- @param file filename or file-like object
-- @tparam[opt] {string} fieldnames list of fields (optional)
-- @string[opt='\t'] delim delimiter (default tab)
function data.write (data,file,fieldnames,delim)
    local f,err,opened = open_file(file,'w')
    if not f then return nil, err end
    if fieldnames and #fieldnames > 0 then
        f:write(concat(data.fieldnames,delim),'\n')
    end
    delim = delim or '\t'
    for i = 1,#data do
        write_row(data,f,data[i],delim)
    end
    if opened then f:close() end
end


function DataMT:write(file)
    data.write(self,file,self.fieldnames,self.delim)
end

local function massage_fieldnames (fields,copy)
    -- fieldnames must be valid Lua identifiers; ignore any surrounding padding
    -- but keep the original fieldnames...
    for i = 1,#fields do
        local f = strip(fields[i])
        copy[i] = f
        fields[i] = f:gsub('%W','_')
    end
end

--- create a new dataset from a table of rows.
-- Can specify the fieldnames, else the table must have a field called
-- 'fieldnames', which is either a string of delimiter-separated names,
-- or a table of names. <br>
-- If the table does not have a field called 'delim', then an attempt will be
-- made to guess it from the fieldnames string, defaults otherwise to tab.
-- @param d the table.
-- @tparam[opt] {string} fieldnames optional fieldnames
-- @return the table.
function data.new (d,fieldnames)
    d.fieldnames = d.fieldnames or fieldnames or ''
    if not d.delim and type(d.fieldnames) == 'string' then
        d.delim = guess_delim(d.fieldnames)
        d.fieldnames = split(d.fieldnames,d.delim)
    end
    d.fieldnames = make_list(d.fieldnames)
    d.original_fieldnames = {}
    massage_fieldnames(d.fieldnames,d.original_fieldnames)
    setmetatable(d,DataMT)
    -- a query with just the fieldname will return a sequence
    -- of values, which seq.copy turns into a table.
    return d
end

local sorted_query = [[
return function (t)
    local i = 0
    local v
    local ls = {}
    for i,v in ipairs(t) do
        if CONDITION then
            ls[#ls+1] = v
        end
    end
    table.sort(ls,function(v1,v2)
        return SORT_EXPR
    end)
    local n = #ls
    return function()
        i = i + 1
        v = ls[i]
        if i > n then return end
        return FIELDLIST
    end
end
]]

-- question: is this optimized case actually worth the extra code?
local simple_query = [[
return function (t)
    local n = #t
    local i = 0
    local v
    return function()
        repeat
            i = i + 1
            v = t[i]
        until i > n or CONDITION
        if i > n then return end
        return FIELDLIST
    end
end
]]

local function is_string (s)
    return type(s) == 'string'
end

local field_error

local function fieldnames_as_string (data)
    return concat(data.fieldnames,',')
end

local function massage_fields(data,f)
    local idx
    if f:find '^%d+$' then
        idx = tonumber(f)
    else
        idx = find(data.fieldnames,f)
    end
    if idx then
        return 'v['..idx..']'
    else
        field_error = f..' not found in '..fieldnames_as_string(data)
        return f
    end
end


local function process_select (data,parms)
    --- preparing fields ----
    local res,ret
    field_error = nil
    local fields = parms.fields
    local numfields = fields:find '%$'  or #data.fieldnames == 0
    if fields:find '^%s*%*%s*' then
        if not numfields then
            fields = fieldnames_as_string(data)
        else
            local ncol = #data[1]
            fields = {}
            for i = 1,ncol do append(fields,'$'..i) end
            fields = concat(fields,',')
        end
    end
    local idpat = patterns.IDEN
    if numfields then
        idpat = '%$(%d+)'
    else
        -- massage field names to replace non-identifier chars
        fields = rstrip(fields):gsub('[^,%w]','_')
    end
    local massage_fields = utils.bind1(massage_fields,data)
    ret = gsub(fields,idpat,massage_fields)
    if field_error then return nil,field_error end
    parms.fields = fields
    parms.proc_fields = ret
    parms.where = parms.where or  'true'
    if is_string(parms.where) then
        parms.where = gsub(parms.where,idpat,massage_fields)
        field_error = nil
    end
    return true
end


parse_select = function(s,data)
    local endp
    local parms = {}
    local w1,w2 = s:find('where ')
    local s1,s2 = s:find('sort by ')
    if w1 then -- where clause!
        endp = (s1 or 0)-1
        parms.where = s:sub(w2+1,endp)
    end
    if s1 then -- sort by clause (must be last!)
        parms.sort_by = s:sub(s2+1)
    end
    endp = (w1 or s1 or 0)-1
    parms.fields = s:sub(1,endp)
    local status,err = process_select(data,parms)
    if not status then return nil,err
    else return parms end
end

--- create a query iterator from a select string.
-- Select string has this format: <br>
-- FIELDLIST [ where LUA-CONDN [ sort by FIELD] ]<br>
-- FIELDLIST is a comma-separated list of valid fields, or '*'. <br> <br>
-- The condition can also be a table, with fields 'fields' (comma-sep string or
-- table), 'sort_by' (string) and 'where' (Lua expression string or function)
-- @param data table produced by read
-- @param condn select string or table
-- @param context a list of tables to be searched when resolving functions
-- @param return_row if true, wrap the results in a row table
-- @return an iterator over the specified fields, or nil
-- @return an error message
function data.query(data,condn,context,return_row)
    local err
    if is_string(condn) then
        condn,err = parse_select(condn,data)
        if not condn then return nil,err end
    elseif type(condn) == 'table' then
        if type(condn.fields) == 'table' then
            condn.fields = concat(condn.fields,',')
        end
        if not condn.proc_fields then
            local status,err = process_select(data,condn)
            if not status then return nil,err end
        end
    else
        return nil, "condition must be a string or a table"
    end
    local query, k
    if condn.sort_by then -- use sorted_query
        query = sorted_query
    else
        query = simple_query
    end
    local fields = condn.proc_fields or condn.fields
    if return_row then
        fields = '{'..fields..'}'
    end
    query,k = query:gsub('FIELDLIST',fields)
    if is_string(condn.where) then
        query = query:gsub('CONDITION',condn.where)
        condn.where = nil
    else
       query = query:gsub('CONDITION','_condn(v)')
       condn.where = function_arg(0,condn.where,'condition.where must be callable')
    end
    if condn.sort_by then
        local expr,sort_var,sort_dir
        local sort_by = condn.sort_by
        local i1,i2 = sort_by:find('%s+')
        if i1 then
            sort_var,sort_dir = sort_by:sub(1,i1-1),sort_by:sub(i2+1)
        else
            sort_var = sort_by
            sort_dir = 'asc'
        end
        if sort_var:match '^%$' then sort_var = sort_var:sub(2) end
        sort_var = massage_fields(data,sort_var)
        if field_error then return nil,field_error end
        if sort_dir == 'asc' then
            sort_dir = '<'
        else
            sort_dir = '>'
        end
        expr = ('%s %s %s'):format(sort_var:gsub('v','v1'),sort_dir,sort_var:gsub('v','v2'))
        query = query:gsub('SORT_EXPR',expr)
    end
    if condn.where then
        query = 'return function(_condn) '..query..' end'
    end
    if _DEBUG then print(query) end

    local fn,err = utils.load(query,'tmp')
    if not fn then return nil,err end
    fn = fn() -- get the function
    if condn.where then
        fn = fn(condn.where)
    end
    local qfun = fn(data)
    if context then
        -- [specifying context for condition] @context is a list of tables which are
        -- 'injected'into the condition's custom context
        append(context,_G)
        local lookup = {}
        utils.setfenv(qfun,lookup)
        setmetatable(lookup,{
            __index = function(tbl,key)
               -- _G.print(tbl,key)
                for k,t in ipairs(context) do
                    if t[key] then return t[key] end
                end
            end
        })
    end
    return qfun
end


DataMT.select = data.query
DataMT.select_row = function(d,condn,context)
    return data.query(d,condn,context,true)
end

--- Filter input using a query.
-- @string Q a query string
-- @param infile filename or file-like object
-- @param outfile filename or file-like object
-- @bool dont_fail true if you want to return an error, not just fail
function data.filter (Q,infile,outfile,dont_fail)
    local err
    local d = data.read(infile or 'stdin')
    local out = open_file(outfile or 'stdout')
    local iter,err = d:select(Q)
    local delim = d.delim
    if not iter then
        err = 'error: '..err
        if dont_fail then
            return nil,err
        else
            utils.quit(1,err)
        end
    end
    while true do
        local res = {iter()}
        if #res == 0 then break end
        out:write(concat(res,delim),'\n')
    end
end

return data
generated by LDoc 1.4.3 Last updated 2014-11-01 18:36:39