-- Try something like this: http://pastie.org/1213943
-- TABLES
-- staging: landing table for the raw import, one row per person.
-- states_csv carries all of that person's state codes packed into one
-- delimited string (e.g. 'NY~NJ~CT'); it is nullable.
drop table if exists staging;
create table staging (
  person_id   int unsigned   not null,
  name        varchar(255)   not null,
  states_csv  varchar(1024),
  primary key (person_id)
) engine = InnoDB;
-- persons: cleansed person records, keyed by the person_id that arrives
-- in the import (no surrogate key is generated here).
drop table if exists persons;
create table persons (
  person_id  int unsigned  not null,
  name       varchar(255)  not null,
  primary key (person_id)
) engine = InnoDB;
-- states: lookup table of distinct state codes.
--   state_id   : generated integer surrogate key
--   state_code : the code exactly as it appears in staging; must be unique
--   name       : optional descriptive name
drop table if exists states;
create table states (
  state_id    tinyint unsigned  not null auto_increment,
  state_code  varchar(3)        not null,
  name        varchar(255)      null,
  primary key (state_id),
  unique key (state_code)
) engine = InnoDB;
/*
person_states: junction table linking persons to states.

The primary key leads with state_id, which favours "everyone in state X"
lookups (e.g. all the people from NY). If your queries mostly go the other
way, consider making the primary key (person_id, state_id) instead. The
secondary index on person_id covers per-person lookups in the meantime.
*/
drop table if exists person_states;
create table person_states (
  state_id   tinyint unsigned  not null,
  person_id  int unsigned      not null,
  primary key (state_id, person_id),
  index (person_id)
) engine = InnoDB;
-- STORED PROCEDURES
/*
load_staging_data()

Empties the staging table and refills it with five hard-coded sample rows.
This stands in for a real bulk import (e.g. load data infile).

Takes no parameters and returns no result set.

Notes:
  - truncate table performs an implicit commit in MySQL and cannot be
    rolled back, so it sits outside the explicit transaction.
  - The insert names its columns, so it keeps working if columns are later
    added to or reordered in staging.
  - The rows are loaded inside start transaction / commit; the session's
    autocommit setting is left untouched for the caller.
*/
drop procedure if exists load_staging_data;
delimiter #
create procedure load_staging_data()
proc_main:begin
  truncate table staging;

  -- assume this is done by load data infile...
  start transaction;
  insert into staging (person_id, name, states_csv) values
    (1234, 'Sally', 'NY~NJ~CT'),
    (1235, 'Dave',  'ME~MA~FL'),
    (3245, 'Fred',  'UT~CA'),
    (2344, 'Sue',   'OR'),
    (5555, 'f00',   'OR~NY');
  commit;
end proc_main #
delimiter ;
/*
cleanse_map_staging_data()

Normalises the contents of staging into persons, states and person_states:

  1. Copies (person_id, name) from staging into persons, ignoring rows whose
     person_id already exists.
  2. Walks staging with a cursor, splits each states_csv value on '~',
     upper-cases and trims every code, registers unseen codes in states and
     links the person to each state in person_states.

Takes no parameters and returns no result set. Re-running it over the same
staging data adds nothing new (all writes are insert ignore / not exists).

Behavioural notes:
  - The cursor loop exits immediately after a fetch reports "not found", so
    no row is processed twice and an empty staging table does no work.
  - No select ... into is used inside the loop: a lookup that finds nothing
    would fire the cursor's NOT FOUND handler and end the loop prematurely.
  - The split position advances by the delimiter position, not by the length
    of the trimmed token, so padded tokens ('NY ~ NJ') and empty tokens
    ('CA~~UT') do not cut the list short. Empty tokens are skipped.
  - A NULL states_csv is treated as an empty list.
  - Assigning a token longer than 3 characters to v_state_code is left to
    the server's sql_mode (error in strict mode, truncation otherwise).
  - Work is wrapped in start transaction / commit; the session's autocommit
    setting is left untouched for the caller.
*/
drop procedure if exists cleanse_map_staging_data;
delimiter #
create procedure cleanse_map_staging_data()
proc_main:begin
  declare v_cursor_done tinyint unsigned default 0;
  -- watch out for variable names that have the same names as fields !!
  declare v_person_id int unsigned;
  declare v_states_csv varchar(1024);
  declare v_token varchar(1024);        -- one raw, trimmed piece of the csv
  declare v_state_code varchar(3);
  declare v_csv_len int unsigned;       -- character length of the cleaned csv
  declare v_states_idx int unsigned;    -- 1-based start of the next token
  declare v_delim_pos int unsigned;     -- position of the next '~' (or end + 1)

  declare v_staging_cur cursor for
    select person_id, states_csv from staging order by person_id;
  declare continue handler for not found set v_cursor_done = 1;

  -- do the person data
  start transaction;
  insert ignore into persons (person_id, name)
  select person_id, name from staging order by person_id;
  commit;

  -- ok now we have to use the cursor !!
  start transaction;
  open v_staging_cur;

  staging_loop: loop
    fetch v_staging_cur into v_person_id, v_states_csv;
    -- test the flag straight after the fetch so the previous row's values
    -- are never processed a second time
    if v_cursor_done then
      leave staging_loop;
    end if;

    -- clean up the data (for example)
    set v_states_csv = upper(trim(coalesce(v_states_csv, '')));
    set v_csv_len = char_length(v_states_csv);
    set v_states_idx = 1;

    -- split out v_states_csv and insert
    while v_states_idx <= v_csv_len do
      set v_delim_pos = locate('~', v_states_csv, v_states_idx);
      if v_delim_pos = 0 then
        -- no more delimiters: the final token runs to the end of the string
        set v_delim_pos = v_csv_len + 1;
      end if;

      set v_token = trim(substring(v_states_csv, v_states_idx, v_delim_pos - v_states_idx));
      -- always step past the delimiter, whatever the trimmed token length
      set v_states_idx = v_delim_pos + 1;

      if char_length(v_token) > 0 then
        set v_state_code = v_token;

        -- add the state if it doesnt already exist; the not exists guard
        -- avoids consuming an auto_increment value for codes already present
        insert ignore into states (state_code)
        select v_state_code
        from dual
        where not exists (select 1 from states where state_code = v_state_code);

        -- add the person state; insert ... select simply inserts nothing if
        -- the state is missing, without raising NOT FOUND
        insert ignore into person_states (state_id, person_id)
        select state_id, v_person_id
        from states
        where state_code = v_state_code;
      end if;
    end while;
  end loop staging_loop;

  close v_staging_cur;
  commit;
end proc_main #
delimiter ;
-- TESTING
-- Smoke test: load the sample rows, normalise them, then show every table.
-- Columns are listed explicitly and every query is ordered by its key so the
-- output is stable across runs and does not change shape if columns are added.
call load_staging_data();

select person_id, name, states_csv
from staging
order by person_id;

call cleanse_map_staging_data();

select state_id, state_code, name
from states
order by state_id;

select person_id, name
from persons
order by person_id;

select state_id, person_id
from person_states
order by state_id, person_id;