Hello, thanks very much for all your comments. They were very useful. I wrote my code using the directory functionality. What it does is it reads through the file and creates an output file for each of the user with all his status updates. Here is the code pasted below.
Script to extract info from individual data files and print out a data file combining info from these files
import os
import commands
dataFileDir="data/";
Dictionary linking names to email ids
For the time being, assume no 2 people have the same name
usrName2Id={};
User id to user name mapping to check for duplicate names
usrId2Name={};
Store info: key: user ids and values a dictionary with time stamp keys and status messages values
infoDict={};
Given an array of space tokenized inputs, extract user name
def getUserName(info,mailInd):
userName="";
for i in range(mailInd-1,0,-1):
if info[i].endswith("-") or info[i].endswith("+"):
break;
userName=info[i]+" "+userName;
userName=userName.strip();
userName=userName.replace(" "," ");
userName=userName.replace(" ","_");
return userName;
Given an array of space tokenized inputs, extract time stamp
def getTimeStamp(info,timeStartInd):
timeStamp="";
for i in range(timeStartInd+1,len(info)):
timeStamp=timeStamp+" "+info[i];
timeStamp=timeStamp.replace("-","");
timeStamp=timeStamp.strip();
return timeStamp;
Given an array of space tokenized inputs, extract status message
def getStatusMsg(info,startInd,endInd):
msg="";
for i in range(startInd,endInd):
msg=msg+" "+info[i];
msg=msg.strip();
msg=msg.replace(" ","_");
return msg;
Extract and store info from each line in the datafile
def extractLineInfo(line):
print line;
info=line.split(" ");
mailInd=-1;userId="-NONE-";
timeStartInd=-1;timeStamp="-NONE-";
becameInd="-1";
statusMsg="-NONE-";
#Find indices of email id and "@" char indicating start of timestamp
for i in range(0,len(info)):
#print (str(i)+" "+info[i]);
if(info[i].startswith("(") and info[i].endswith("@in.ibm.com)")):
mailInd=i;
if(info[i]=="@"):
timeStartInd=i;
if(info[i]=="became"):
becameInd=i;
#Debug print of mail and time stamp start inds
"""print "\n";
print "Index of mail id: "+str(mailInd);
print "Index of time start index: "+str(timeStartInd);
print "\n";"""
#Extract IBM user id and name for lines with ibm id
if(mailInd>=0):
userId=info[mailInd].replace("(","");
userId=userId.replace(")","");
userName=getUserName(info,mailInd);
#Lines with no ibm id are of the form "Suraj Godar Mr became idle @ 15/07/2010 16:30:18"
elif(becameInd>0):
userName=getUserName(info,becameInd);
#Time stamp info
if(timeStartInd>=0):
timeStamp=getTimeStamp(info,timeStartInd);
if(mailInd>=0):
statusMsg=getStatusMsg(info,mailInd+1,timeStartInd);
elif(becameInd>0):
statusMsg=getStatusMsg(info,becameInd,timeStartInd);
print userId;
print userName;
print timeStamp
print statusMsg+"\n";
if not(userName in usrName2Id) and not(userName=="-NONE-") and not(userId=="-NONE-"):
usrName2Id[userName]=userId;
#Store status messages keyed by user email ids
timeDict={};
#Retrieve user id corresponding to user name
if userName in usrName2Id:
userId=usrName2Id[userName];
#For valid user ids, store status message in the dict within dict data str arrangement
if not(userId=="-NONE-"):
if not(userId in infoDict.keys()):
infoDict[userId]={};
timeDict=infoDict[userId];
if not(timeStamp in timeDict.keys()):
timeDict[timeStamp]=statusMsg;
else:
timeDict[timeStamp]=timeDict[timeStamp]+" "+statusMsg;
Print for each user a file containing status
def printStatusFiles(dataFileDir):
volNum=0;
for userName in usrName2Id:
volNum=volNum+1;
filename=dataFileDir+"/"+"status-"+str(volNum)+".txt";
file = open(filename,"w");
print "Printing output file name: "+filename;
print volNum,userName,usrName2Id[userName]+"\n";
file.write(userName+" "+usrName2Id[userName]+"\n");
timeDict=infoDict[usrName2Id[userName]];
for time in sorted(timeDict.keys()):
file.write(time+" "+timeDict[time]+"\n");
Read and store data from individual data files
def readDataFiles(dataFileDir):
#Process each datafile
files=os.listdir(dataFileDir)
files.sort();
for i in range(0,len(files)):
#for i in range(0,1):
file=files[i];
#Do not process other non-data files lying around in that dir
if not file.endswith(".txt"):
continue
print "Processing data file: "+file
dataFile=dataFileDir+str(file);
inpFile=open(dataFile,"r");
lines=inpFile.readlines();
#Process lines
for line in lines:
#Clean lines
line=line.strip();
line=line.replace("/India/Contr/IBM","");
line=line.strip();
#Skip header line of the file and L's sign in sign out times
if(line.startswith("System log for account") or line.find("signed")>-1):
continue;
extractLineInfo(line);
print "\n";
readDataFiles(dataFileDir);
print "\n";
printStatusFiles("out/");