I have a table with ~17 million rows in it. I need to de-duplicate the rows in the table. Under normal circumstances this wouldn't be a challenge, however, this isn't a normal circumstance. Normally 'duplicate rows' is defined as two or more rows containing the exact same values for all columns. In this case 'duplicate rows' is defined as two or more rows that have the exact same values, but are also within 20 seconds of each other. I wrote a script that is still running after 19.5 hours, this isn't acceptable, but I'm not sure how else to do it. Here's the script:
begin
create table ##dupes (ID int)
declare curOriginals cursor for
select ID, AssociatedEntityID, AssociatedEntityType, [Timestamp] from tblTable
declare @ID int
declare @AssocEntity int
declare @AssocType int
declare @Timestamp datetime
declare @Count int
open curOriginals
fetch next from curOriginals into @ID, @AssocEntity, @AssocType, @Timestamp
while @@FETCH_STATUS = 0
begin
select @Count = COUNT(*) from tblTable where AssociatedEntityID = @AssocEntity and AssociatedEntityType = @AssocType
and [Timestamp] >= DATEADD(ss, -20, @Timestamp)
and [Timestamp] <= DATEADD(ss, 20, @Timestamp)
and ID <> @ID
if (@Count > 0)
begin
insert into ##dupes (ID)
(select ID from tblHBMLog where AssociatedEntityID = @AssocEntity and AssociatedEntityType = @AssocType
and [Timestamp] >= DATEADD(ss, -20, @Timestamp)
and [Timestamp] <= DATEADD(ss, 20, @Timestamp)
and ID <> @ID)
print @ID
end
delete from tblHBMLog where ID = @ID or ID in (select ID from ##dupes)
fetch next from curOriginals into @ID, @AssocEntity, @AssocType, @Timestamp
end
close curOriginals
deallocate curOriginals
select * from ##dupes
drop table ##dupes
end
Any help would be greatly appreciated.