/ utilities to quickly load a csv file - for more exhaustive analysis of the csv contents see csvguess.q
/ 2009.09.20 - updated to match latest csvguess.q

/ .csv.colhdrs[file] - return a list of colhdrs from file
/ info:.csv.info[file] - return a table of information about the file
/ columns are:
/  c - column name; ci - column index; t - load type; mw - max width;
/  dchar - distinct characters in values; rule - rule that caught the type
/  maybe - needs checking, _could_ be say a date, but perhaps just a float?
/ .csv.info0[file;onlycols] - like .csv.info except that it only analyses <onlycols>
/  example:
/   info:.csv.info0[file;(.csv.colhdrs file)like"*price"]
/   info:.csv.infolike[file;"*price"]
/   show delete from info where t=" "
/ .csv.data[file;info] - use the info from .csv.info to read the data
/ .csv.data10[file;info] - like .csv.data but only returns the first 10 rows
/ bulkload[file;info] - bulk loads file into table DATA (which must be already defined :: DATA:() )
/ .csv.read[file]/read10[file] - for when you don't care about checking/tweaking the <info> before reading
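
/ typical check-then-load workflow (illustrative sketch; `:trades.csv is a hypothetical file):
/  info:.csv.info`:trades.csv            / guess a load type for each column
/  show select c,t,maybe,rule from info  / review the guesses, tweak t where needed
/  t:.csv.data[`:trades.csv;info]        / load using the (possibly tweaked) info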

\d .csv
DELIM:","
ZAPHDRS:0b / lowercase and remove _ from colhdrs (junk characters are always removed)
WIDTHHDR:25000 / number of characters read to get the header
READLINES:222 / number of lines read and used to guess the types
SYMMAXWIDTH:11 / character columns narrower than this are stored as symbols
SYMMAXGR:10 / max symbol granularity% before we give up and keep as a * string
FORCECHARWIDTH:30 / every field (of any type) with values this wide or more is forced to character "*"
DISCARDEMPTY:0b / completely ignore empty columns if true else set them to "C"
CHUNKSIZE:50000000 / used in fs2 (modified .Q.fs)
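
/ these settings can be overridden after loading this script, e.g. for a pipe-delimited
/ file with messy headers (illustrative values, not the defaults):
/  .csv.DELIM:"|"
/  .csv.ZAPHDRS:1b
/  .csv.READLINES:1000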

k)nameltrim:{$[~@x;.z.s'x;~(*x)in aA:.Q.a,.Q.A;(+/&\~x in aA)_x;x]}
k)fs2:{[f;s]((-7!s)>){[f;s;x]i:1+last@&0xa=r:1:(s;x;CHUNKSIZE);f@`\:i#r;x+i}[f;s]/0j}
cleanhdrs:{{$[ZAPHDRS;lower x except"_";x]}x where x in DELIM,.Q.an}
cancast:{nw:x$"";if[not x in"BXCS";nw:(min 0#;max 0#;::)@\:nw];$[not any nw in x$(11&count y)#y;$[11<count y;not any nw in x$y;1b];0b]}
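
/ cancast[t;strings] returns 1b when the strings cast to type t without producing
/ nulls/infinities; it samples the first 11 values before trying the whole list. illustrative:
/  .csv.cancast["J"]("123";"-45")  / 1b
/  .csv.cancast["J"]("123";"4.5")  / 0b - "J"$"4.5" is null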

read:{[file]data[file;info[file]]}
read10:{[file]data10[file;info[file]]}
colhdrs:{[file] `$nameltrim DELIM vs cleanhdrs first read0(file;0;1+first where 0xa=read1(file;0;WIDTHHDR))}
data:{[file;info] (exec c from info where not t=" ")xcol(exec t from info;enlist DELIM)0:file}
data10:{[file;info] data[;info](file;0;1+last 11#where 0xa=read1(file;0;15*WIDTHHDR))}
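
/ quick look at a file before committing to a full load (illustrative; hypothetical file):
/  info:.csv.info`:trades.csv
/  .csv.data10[`:trades.csv;info]  / roughly the first 10 data rows only
/  .csv.read10`:trades.csv         / same, without the chance to tweak info first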

info0:{[file;onlycols]
 colhdrs:`$nameltrim DELIM vs cleanhdrs first head:read0(file;0;1+last where 0xa=read1(file;0;WIDTHHDR));
 loadfmts:(count colhdrs)#"S";if[count onlycols;loadfmts[where not colhdrs in onlycols]:"C"];
 breaks:where 0xa=read1(file;0;floor(10+READLINES)*WIDTHHDR%count head);
 nas:count as:colhdrs xcol(loadfmts;enlist DELIM)0:(file;0;1+last((1+READLINES)&count breaks)#breaks);
 info:([]c:key flip as;v:value flip as);as:();
 reserved:key`.q;reserved,:.Q.res;reserved,:`i;
 info:update res:c in reserved from info;
 info:update ci:i,t:"?",ipa:0b,mdot:0,mw:0,rule:0,gr:0,ndv:0,maybe:0b,empty:0b,j10:0b,j12:0b from info;
 info:update ci:`s#ci from info;
 if[count onlycols;info:update t:" ",rule:10 from info where not c in onlycols];
 info:update sdv:{string(distinct x)except`}peach v from info;
 info:update ndv:count each sdv from info;
 info:update gr:floor 0.5+100*ndv%nas,mw:{max count each x}peach sdv from info where 0<ndv;
 info:update t:"*",rule:20 from info where mw>.csv.FORCECHARWIDTH; / long values
 info:update t:"C "[.csv.DISCARDEMPTY],rule:30,empty:1b from info where t="?",mw=0; / empty columns
 info:update dchar:{asc distinct raze x}peach sdv from info where t="?";
 info:update mdot:{max sum each"."=x}peach sdv from info where t="?",{"."in x}each dchar;
 info:update t:"n",rule:40 from info where t="?",{any x in"0123456789"}each dchar; / vaguely numeric..
 info:update t:"I",rule:50,ipa:1b from info where t="n",mw within 7 15,mdot=3,{all x in".0123456789"}each dchar,.csv.cancast["I"]peach sdv; / ip-address
 info:update t:"J",rule:60 from info where t="n",mdot=0,{all x in"+-0123456789"}each dchar,.csv.cancast["J"]peach sdv;
 info:update t:"I",rule:70 from info where t="J",mw<12,.csv.cancast["I"]peach sdv;
 info:update t:"H",rule:80 from info where t="I",mw<7,.csv.cancast["H"]peach sdv;
 info:update t:"F",rule:90 from info where t="n",mdot<2,mw>1,.csv.cancast["F"]peach sdv;
 info:update t:"E",rule:100,maybe:1b from info where t="F",mw<9;
 info:update t:"M",rule:110,maybe:1b from info where t in"nIHEF",mdot<2,mw within 4 7,.csv.cancast["M"]peach sdv;
 info:update t:"D",rule:120,maybe:1b from info where t in"nI",mdot in 0 2,mw within 6 11,.csv.cancast["D"]peach sdv;
 info:update t:"V",rule:130,maybe:1b from info where t="I",mw in 5 6,7<count each dchar,{all x like"*[0-9][0-5][0-9][0-5][0-9]"}peach sdv,.csv.cancast["V"]peach sdv; / 235959 12345
 info:update t:"U",rule:140,maybe:1b from info where t="H",mw in 3 4,7<count each dchar,{all x like"*[0-9][0-5][0-9]"}peach sdv,.csv.cancast["U"]peach sdv; / 2359
 info:update t:"U",rule:150,maybe:0b from info where t="n",mw in 4 5,mdot=0,{all x like"*[0-9]:[0-5][0-9]"}peach sdv,.csv.cancast["U"]peach sdv;
 info:update t:"T",rule:160,maybe:0b from info where t="n",mw within 7 12,mdot<2,{all x like"*[0-9]:[0-5][0-9]:[0-5][0-9]*"}peach sdv,.csv.cancast["T"]peach sdv;
 info:update t:"V",rule:170,maybe:0b from info where t="T",mw in 7 8,mdot=0,.csv.cancast["V"]peach sdv;
 info:update t:"T",rule:180,maybe:1b from info where t in"EF",mw within 7 10,mdot=1,{all x like"*[0-9][0-5][0-9][0-5][0-9].*"}peach sdv,.csv.cancast["T"]peach sdv;
 info:update t:"Z",rule:190,maybe:0b from info where t="n",mw within 11 24,mdot<4,.csv.cancast["Z"]peach sdv;
 info:update t:"P",rule:200,maybe:1b from info where t="n",mw within 12 29,mdot<4,{all x like"[12]*"}peach sdv,.csv.cancast["P"]peach sdv;
 info:update t:"N",rule:210,maybe:1b from info where t="n",mw within 3 28,mdot=1,.csv.cancast["N"]peach sdv;
 info:update t:"?",rule:220,maybe:0b from info where t="n"; / reset remaining maybe numeric
 info:update t:"C",rule:230,maybe:0b from info where t="?",mw=1; / char
 info:update t:"B",rule:240,maybe:0b from info where t in"HC",mw=1,mdot=0,{$[all x in"01tTfFyYnN";(any"0fFnN"in x)and any"1tTyY"in x;0b]}each dchar; / boolean
 info:update t:"B",rule:250,maybe:1b from info where t in"HC",mw=1,mdot=0,{all x in"01tTfFyYnN"}each dchar; / boolean
 info:update t:"X",rule:260,maybe:0b from info where t="?",mw=2,{$[all x in"0123456789abcdefABCDEF";(any .Q.n in x)and any"abcdefABCDEF"in x;0b]}each dchar; / hex
 info:update t:"S",rule:270,maybe:1b from info where t="?",mw<.csv.SYMMAXWIDTH,mw>1,gr<.csv.SYMMAXGR; / symbols (max width permitting)
 info:update t:"*",rule:280,maybe:0b from info where t="?"; / the rest as strings
 / flag those S/* columns which could be encoded to integers (.Q.j10/x10/j12/x12) to avoid symbols
 info:update j12:1b from info where t in"S*",mw<13,{all x in .Q.nA}each dchar;
 info:update j10:1b from info where t in"S*",mw<11,{all x in .Q.b6}each dchar;
 select c,ci,t,maybe,empty,res,j10,j12,ipa,mw,mdot,rule,gr,ndv,dchar from info}
info:info0[;()] / by default don't restrict columns
infolike:{[file;pattern] info0[file;{x where x like y}[lower colhdrs[file];pattern]]} / .csv.infolike[file;"*time"]
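
/ guesses flagged maybe=1b are worth a manual check before loading; e.g. force a column
/ guessed as a date back to a string (illustrative; `ordid is a hypothetical column):
/  info:.csv.info`:trades.csv
/  update t:"*" from `info where c=`ordid
/  t:.csv.data[`:trades.csv;info]
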
\d .
/ DATA:()
bulkload:{[file;info]
 if[not`DATA in system"v";'`DATA.not.defined]; if[count DATA;'`DATA.not.empty];
 loadhdrs:exec c from info where not t=" ";loadfmts:exec t from info;
 .csv.fs2[{[file;loadhdrs;loadfmts]`DATA insert $[count DATA;flip loadhdrs!(loadfmts;.csv.DELIM)0:file;loadhdrs xcol(loadfmts;enlist .csv.DELIM)0:file]}[;loadhdrs;loadfmts];file];
 count DATA}
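
/ chunked bulk load of a large file into the pre-declared global DATA (illustrative; hypothetical file):
/  DATA:()
/  info:.csv.info`:big.csv
/  bulkload[`:big.csv;info]
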
@[.:;"\\l csvutil.custom.q";::]; / save your custom settings in csvutil.custom.q to override those set at the beginning of the file
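
/ a minimal csvutil.custom.q might contain, for example (illustrative values only):
/  .csv.DELIM:";"
/  .csv.SYMMAXWIDTH:15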