I want to compute and count all trigrams of a string in SQL Server.
For example, if the string is 'hello', I want the following output:
Trigram Count
------- -----
hel     1
ell     1
llo     1
lo-     1
I want to compute and count all trigrams of a string in SQL Server.
For example, if the string is 'hello', I want the following output:
Trigram Count
------- -----
hel     1
ell     1
llo     1
lo-     1
I still don't know what an n-gram is, but based on Ed's answer, is this what you need?
-- Count every n-gram (each window of @n consecutive characters) in @string.
-- The string is first right-padded with '-' until its length is a multiple
-- of @n, so 'hello' with @n = 3 becomes 'hello-' and yields hel, ell, llo, lo-.
DECLARE @string varchar(max) = 'hello';
DECLARE @n int = 3;

-- The outer "% @n" maps a remainder of 0 to zero pad characters; without it a
-- string whose length is already a multiple of @n would gain @n extra dashes
-- and report n-grams made mostly or entirely of padding.
SET @string = @string + REPLICATE('-', (@n - LEN(@string) % @n) % @n);

-- positions: one row per n-gram start index, 1 .. LEN(@string) - @n + 1.
-- MAXRECURSION 0 removes the default cap of 100 recursion levels, which a
-- string longer than about 100 characters would otherwise hit.
WITH positions AS
(
    -- No anchor row when the string is shorter than @n (empty or NULL input),
    -- so no empty or partial n-gram is reported.
    SELECT 1 AS i
    WHERE LEN(@string) >= @n
    UNION ALL
    SELECT i + 1
    FROM positions
    WHERE i <= LEN(@string) - @n
)
SELECT
    SUBSTRING(@string, i, @n) AS Trigram,
    COUNT(*) AS [Count]
FROM positions
GROUP BY SUBSTRING(@string, i, @n)
OPTION (MAXRECURSION 0);
Based on Martin Smith's answer, with added logic to pad the string with '-' characters
until its length is divisible by 3:
-- Count every trigram (each window of 3 consecutive characters) in @string.
-- The string is first right-padded with '-' until its length is divisible by 3,
-- so 'hello' becomes 'hello-' and yields hel, ell, llo, lo-.
DECLARE @string varchar(max) = 'hello';

-- A remainder of 1 needs two pad characters, a remainder of 2 needs one.
SET @string = @string + CASE LEN(@string) % 3
                            WHEN 1 THEN '--'
                            WHEN 2 THEN '-'
                            ELSE ''
                        END;

-- positions: one row per trigram start index, 1 .. LEN(@string) - 2.
-- MAXRECURSION 0 removes the default cap of 100 recursion levels, which a
-- string longer than about 100 characters would otherwise hit.
WITH positions AS
(
    -- No anchor row for an empty (or NULL) string; otherwise a single empty
    -- "trigram" with count 1 would be reported.
    SELECT 1 AS i
    WHERE LEN(@string) >= 3
    UNION ALL
    SELECT i + 1
    FROM positions
    WHERE i < LEN(@string) - 2
)
SELECT
    SUBSTRING(@string, i, 3) AS Trigram,
    COUNT(*) AS [Count]
FROM positions
GROUP BY SUBSTRING(@string, i, 3)
OPTION (MAXRECURSION 0);
Borrowing from Ed and Martin, I think this is a correct implementation:
-- Count every trigram (each window of 3 consecutive characters) in @string,
-- after replacing each space with a visible '-' marker. No padding is applied.
DECLARE @string varchar(max) = 'here kitty kitty';

SET @string = REPLACE(@string, ' ', '-'); -- Wikipedia says this should be underscore, not dash

-- positions: one row per trigram start index, 1 .. LEN(@string) - 2.
-- MAXRECURSION 0 removes the default cap of 100 recursion levels, which a
-- string longer than about 100 characters would otherwise hit.
WITH positions AS
(
    -- No anchor row when fewer than 3 characters are available; otherwise a
    -- 0-, 1- or 2-character fragment would be reported as a trigram.
    SELECT 1 AS i
    WHERE LEN(@string) >= 3
    UNION ALL
    SELECT i + 1
    FROM positions
    WHERE i < LEN(@string) - 2
)
SELECT
    SUBSTRING(@string, i, 3) AS Trigram,
    COUNT(*) AS [Count]
FROM positions
GROUP BY SUBSTRING(@string, i, 3)
OPTION (MAXRECURSION 0);