here's the code!
import csv
def do_work():
global data
global b
get_file()
samples_subset1()
return
def get_file():
start_file='thefile.csv'
with open(start_file, 'rb') as f:
data = list(csv.reader(f))
import collections
counter = collections.defaultdict(int)
for row in data:
counter[row[10]] += 1
return
def samples_subset1():
with open('/pythonwork/samples_subset1.csv', 'wb') as outfile:
writer = csv.writer(outfile)
sample_cutoff=5000
b_counter=0
global b
b=[]
for row in data:
if counter[row[10]] >= sample_cutoff:
global b
b.append(row)
writer.writerow(row)
#print b[b_counter]
b_counter+=1
return
i am a beginner at python. the way my code runs is i call do_work and do_Work will call the other functions. here are my questions:
if i need
data
to be seen by only 2 functions should i make it global? if not then how should i callsamples_subset1
? should i call it fromget_file
or fromdo_work
?the code works but can you please point other good/bad things about the way it is written?
i am processing a csv file and there are multiple steps. i am breaking down the steps into different functions like
get_file
,samples_subset1
, and there are more that i will add. should i continue to do it the way i am doing it right now here i call each individual function fromdo_work
?
here is the new code, according to one of the answers below:
import csv
import collections
def do_work():
global b
(data,counter)=get_file('thefile.csv')
samples_subset1(data, counter,'/pythonwork/samples_subset1.csv')
return
def get_file(start_file):
with open(start_file, 'rb') as f:
global data
data = list(csv.reader(f))
counter = collections.defaultdict(int)
for row in data:
counter[row[10]] += 1
return (data,counter)
def samples_subset1(data,counter,output_file):
with open(output_file, 'wb') as outfile:
writer = csv.writer(outfile)
sample_cutoff=5000
b_counter=0
global b
b=[]
for row in data:
if counter[row[10]] >= sample_cutoff:
global b
b.append(row)
writer.writerow(row)
#print b[b_counter]
b_counter+=1
return