root > codebin > triplify.py

triplify.py

application/x-python, 1410 bytes (load raw)
# -*- coding: utf-8 -*-
import sys, os
import re, pprint
import ConfigParser, optparse
import csv

def triplify( row ):
        """Write one (key, property, value) triple per non-empty field of *row*.

        The function works on module-level state prepared by the script:
        ``keyfield`` (name of the key column), ``props`` (column names to
        export), ``splitby`` (compiled regex or None), ``outfilds`` (output
        column names) and ``writer`` (a csv.DictWriter).

        row -- mapping from column name to cell value; it is indexed by
               ``keyfield`` and by every name in ``props``.

        Empty or missing values are skipped.  When ``splitby`` is set, each
        value is split on it, every fragment is stripped of surrounding
        whitespace, and fragments that end up empty are skipped as well.
        Without ``splitby`` the value is written unchanged.
        """
        key = row[keyfield]
        for p in props:
                value = row[p]
                if not value:
                        continue

                if splitby:
                        parts = [part.strip() for part in splitby.split(value)]
                else:
                        parts = [value]

                for part in parts:
                        # A trailing or doubled separator ("a;b;" / "a;;b")
                        # yields empty fragments; emitting them would create
                        # triples without a value, which the check above is
                        # meant to prevent.
                        if not part:
                                continue
                        writer.writerow( dict(zip(outfilds, [key, p, part])) )

# Script body: read a tab-separated table (file given as first argument, or
# stdin), and write it to stdout as tab-separated (key, property, value)
# triples.  Any further positional arguments name the columns to export; by
# default every column except the key column is exported.
parser = optparse.OptionParser(
        usage="%prog [options] [INPUT [PROPERTY ...]]")

parser.add_option("-s", "--split", dest="splitby", default = None,
                  help="Split multi-value fields at the given character(s)", )

parser.add_option("-k", "--key", dest="keyfield", default = None,
                  help="Use this field as unique key (subject id)", )

(options, args) = parser.parse_args()

splitby = options.splitby
keyfield = options.keyfield

if splitby:
        # The option text is placed verbatim inside a regex character class,
        # so every character given acts as a separator.
        splitby = re.compile("[%s]" % splitby)

if len(args)>0:
        csvin = open(args[0])
else:
        csvin = sys.stdin

csvout = sys.stdout

try:
        reader = csv.DictReader(csvin, delimiter='\t', quotechar='\\')

        props = args[1:]

        # Column names come from the header row of the input
        # (DictReader reads it on first access to .fieldnames).
        fields = reader.fieldnames
        if not fields:
                parser.error("input is empty: no header row found")

        if not keyfield:
                # Default key: the first column of the table.
                keyfield = fields[0]
        elif keyfield not in fields:
                parser.error("key field %r not found in input header" % keyfield)

        if len(props) == 0:
                # Default properties: every column except the key column.
                i = fields.index(keyfield)
                props = fields[:i] + fields[i+1:]
        else:
                unknown = [p for p in props if p not in fields]
                if unknown:
                        parser.error("unknown field(s): %s" % ", ".join(unknown))

        outfilds = [ keyfield, "property", "value" ]

        writer = csv.DictWriter(csvout, outfilds, delimiter='\t', quotechar='\\')
        # Header row of the output: the column names themselves.
        writer.writerow(dict(zip(outfilds, outfilds)))

        for row in reader:
                triplify(row)
finally:
        # Only close what this script opened; never close the std streams.
        if csvin is not sys.stdin:
                csvin.close()
        if csvout and csvout != sys.stdout:
                csvout.close()