The previous post described how to load dimension tables with PyGramETL. This one is about fact tables.
The source CVS file has the following structure:
cust_id;res_date;future_guests;future_revenue 105;2008/04/01 00:00:00.000;2;300 106;2007/04/17 00:00:00.000;4;380 106;2007/04/17 00:00:00.000;4;768 ...
Fact Table
CREATE TABLE reservation_fact( customer_sk INT, reservation_date DATETIME, future_guests FLOAT, future_revenue FLOAT )
Kettle
PyGramETL
import MySQLdb import pygrametl from pygrametl.datasources import CSVSource from pygrametl.tables import SlowlyChangingDimension, BatchFactTable myconn = MySQLdb.connect(user='test', passwd='test', db='test') connection = pygrametl.ConnectionWrapper(myconn) connection.setasdefault() customer_dim = SlowlyChangingDimension( name='customer_dim', key='customer_sk', attributes=['customer_id', 'valid_from', 'valid_to', 'version', 'first_name', 'last_name', 'age', 'phone_number', 'address'], lookupatts=['customer_id'], versionatt='version', fromatt='valid_from', toatt='valid_to', type1atts=('first_name', 'last_name'), cachesize=-1) customer_dim.defaultidvalue = 0 reservation_fact = BatchFactTable( name='reservation_fact', keyrefs=['customer_sk'], measures=['reservation_date', 'future_guests', 'future_revenue']) def load_customer_dim(): customer = CSVSource(file('./customer.txt', 'r', 16384), delimiter=';') for row in customer: customer_dim.scdensure(row) connection.commit() def load_reservation_fact(): reservation = CSVSource(file('./reservation.txt', 'r', 16384), delimiter=';') for row in reservation: row['customer_sk'] = customer_dim.lookup(row, {'customer_id':'cust_id'}) row['reservation_date'] = row['res_date'] reservation_fact.insert(row) connection.commit() def main(): #load_customer_dim() load_reservation_fact() if __name__ == '__main__': main()