views:

285

answers:

5

Hello,

I have a table which logs the userid, course, sessionid and requestdate each time a webpage is loaded. I want to calcuate the duration per userid for a given courseid. It is problematic to do this due to overlapping timespans.

The data provided here should result in 10 minutes duration per user for course 1. I can't seem to get this right.

CREATE TABLE PageLogSample (
    id INT NOT NULL PRIMARY KEY IDENTITY
,   userid INT
,   courseid INT
,   sessionid INT
,   requestdate DATETIME
);

TRUNCATE TABLE PageLogSample;

INSERT INTO PageLogSample (userid, courseid, sessionid, requestdate)
-- [0, 10] = 10 minutes
          SELECT 1, 1, 1, '00:00:00'
UNION ALL SELECT 1, 1, 1, '00:10:00'
-- [0, 12] - [3, 5] = 10 minutes
-- or ... [0, 3] + [5, 12] = 10 minutes
UNION ALL SELECT 2, 1, 2, '00:00:00'
UNION ALL SELECT 2, 2, 2, '00:03:00'
UNION ALL SELECT 2, 2, 2, '00:05:00'
UNION ALL SELECT 2, 1, 2, '00:12:00'
-- [0, 12] - [3, 5] = 10 minutes
-- or ... [0, 3] + [5, 12] = 10 minutes
UNION ALL SELECT 3, 1, 3, '00:00:00'
UNION ALL SELECT 3, 2, 3, '00:03:00'
UNION ALL SELECT 3, 2, 3, '00:05:00'
UNION ALL SELECT 3, 1, 3, '00:12:00'
UNION ALL SELECT 3, 2, 3, '00:15:00'
-- [1, 13] - [3, 5] = 10 minutes
-- or ... [1, 3] + [5, 13] = 10 minutes
UNION ALL SELECT 4, 2, 4, '00:00:00'
UNION ALL SELECT 4, 1, 4, '00:01:00'
UNION ALL SELECT 4, 2, 4, '00:03:00'
UNION ALL SELECT 4, 2, 4, '00:05:00'
UNION ALL SELECT 4, 1, 4, '00:13:00'
UNION ALL SELECT 4, 2, 4, '00:15:00'
-- [0, 5] + [10, 15] = 10 minutes
UNION ALL SELECT 5, 1, 5, '00:00:00'
UNION ALL SELECT 5, 1, 5, '00:05:00'
UNION ALL SELECT 5, 1, 6, '00:10:00'
UNION ALL SELECT 5, 1, 6, '00:15:00'
-- [0, 10] = 10 minutes (ignoring everything inbetween)
UNION ALL SELECT 6, 1, 7, '00:00:00'
UNION ALL SELECT 6, 1, 7, '00:03:00'
UNION ALL SELECT 6, 1, 7, '00:05:00'
UNION ALL SELECT 6, 1, 7, '00:07:00'
UNION ALL SELECT 6, 1, 7, '00:10:00'
-- [0, 11] - [5, 6] = 10 minutes
-- or ... [0, 3] + [7, 11] = 6 minutes (good)
-- or ... [0, 5] + [7, 11] = 9 minutes (better)
UNION ALL SELECT 7, 1, 8, '00:00:00'
UNION ALL SELECT 7, 1, 8, '00:03:00'
UNION ALL SELECT 7, 2, 8, '00:05:00'
UNION ALL SELECT 7, 2, 8, '00:06:00'
UNION ALL SELECT 7, 1, 8, '00:07:00'
UNION ALL SELECT 7, 1, 8, '00:11:00'
-- [0, 1] + [2, 4] + [5, 7] + [8, 13] = 10
UNION ALL SELECT 8, 1, 9, '00:00:00'
UNION ALL SELECT 8, 2, 9, '00:01:00'
UNION ALL SELECT 8, 1, 9, '00:02:00'
UNION ALL SELECT 8, 1, 9, '00:03:00'
UNION ALL SELECT 8, 2, 9, '00:04:00'
UNION ALL SELECT 8, 1, 9, '00:05:00'
UNION ALL SELECT 8, 1, 9, '00:06:00'
UNION ALL SELECT 8, 2, 9, '00:07:00'
UNION ALL SELECT 8, 1, 9, '00:08:00'
UNION ALL SELECT 8, 1, 9, '00:13:00'
;

first trying the naive approach. This gives mistakes with overlapping parts of the session.

DECLARE @courseid INT;
SET @courseid = 1;

SELECT subquery.userid
, COUNT(DISTINCT subquery.sessionid) AS sessioncount
, SUM(subquery.duration) AS duration
, CASE SUM(subquery.duration) 
    WHEN 10 THEN 'ok' 
    ELSE 'ERROR' 
END
FROM (
    SELECT userid
    , sessionid
    , DATEDIFF(MINUTE, MIN(requestdate), MAX(requestdate)) AS duration
    FROM PageLogSample
    WHERE courseid = @courseid
    GROUP BY userid
    , sessionid
) subquery
GROUP BY subquery.userid
ORDER BY subquery.userid;

-- userid  sessioncount  duration   
-- 1       1             10       ok
-- 2       1             12       ERROR
-- 3       1             12       ERROR
-- 4       1             12       ERROR
-- 5       2             10       ok

Second try. Avoid overlappings. This only works partially.

DECLARE @courseid INT;
SET @courseid = 1;

WITH cte (userid, courseid, sessionid, start, finish, duration)
AS (
    SELECT userid
    , courseid
    , sessionid
    , MIN(requestdate)
    , MAX(requestdate)
    , DATEDIFF(MINUTE, MIN(requestdate), MAX(requestdate))
    FROM PageLogSample
    GROUP BY userid
    , courseid
    , sessionid
)
SELECT naive.userid
, naive.sessioncount
, naive.duration AS naiveduration
, correction.duration AS correctionduration
, naive.duration - ISNULL(correction.duration, 0) AS duration
, CASE naive.duration - ISNULL(correction.duration, 0)
    WHEN 10 THEN 'ok' 
    ELSE 'ERROR' 
END
FROM (
    SELECT cte.userid
    , COUNT(DISTINCT cte.sessionid) AS sessioncount
    , SUM(cte.duration) AS duration
    FROM cte
    WHERE cte.courseid = @courseid
    GROUP BY cte.userid
) naive
LEFT JOIN (
    SELECT errors.userid
    , SUM(errors.duration) AS duration
    FROM cte errors
    WHERE errors.courseid <> @courseid
    AND EXISTS (
        SELECT *
        FROM cte
        WHERE cte.start <= errors.start
        AND cte.finish >= errors.finish
        AND cte.courseid = @courseid
    )
    GROUP BY errors.userid
) correction
ON naive.userid = correction.userid
;

-- userid  sessioncount  naiveduration  correctionduration  duration
-- 1       1             10             NULL                10        ok
-- 2       1             12             2                   10        ok
-- 3       1             12             NULL                12        ERROR
-- 4       1             12             NULL                12        ERROR
-- 5       2             10             NULL                10        ok

Update: Ed Harpers comment Really made me rethink my approach.

So here comes the third trial. Here I first search for which rows represent an entrance into the course and which represent someone leaving. Then I take the sum of all endtimes and substract the sum of all begintimes. I think it is more correct, while not perfect.

DECLARE @courseid INT;
SET @courseid = 1;

WITH numberedcte (rn, id, userid, courseid, sessionid, requestdate)
AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY sessionid, userid ORDER BY id)
    , id
    , userid
    , courseid
    , sessionid
    , requestdate
    FROM PageLogSample
)
, typedcte (rowtype, id, userid, courseid, sessionid, requestdate, nextrequestdate)
AS (
    SELECT CASE
     WHEN previousrequest.courseid = nextrequest.courseid
      THEN 'between'
     WHEN previousrequest.courseid IS NULL
      OR nextrequest.courseid = numberedcte.courseid
      THEN 'begin'
     WHEN nextrequest.courseid IS NULL
      OR previousrequest.courseid = numberedcte.courseid
      THEN 'end'
     ELSE 'error?'
    END AS rowtype
    , numberedcte.id
    , numberedcte.userid
    , numberedcte.courseid
    , numberedcte.sessionid
    , numberedcte.requestdate
    , nextrequest.requestdate
    FROM numberedcte
    LEFT JOIN numberedcte previousrequest
     ON previousrequest.userid = numberedcte.userid
     AND previousrequest.sessionid = numberedcte.sessionid
     AND previousrequest.rn = numberedcte.rn - 1
    LEFT JOIN numberedcte nextrequest
     ON nextrequest.userid = numberedcte.userid
     AND nextrequest.sessionid = numberedcte.sessionid
     AND nextrequest.rn = numberedcte.rn + 1
    WHERE numberedcte.courseid = @courseid
    AND (
     nextrequest.courseid = @courseid
     OR previousrequest.courseid = @courseid
    )
)
, beginsum (userid, value)
AS (
    SELECT userid, SUM(DATEPART(MINUTE, requestdate))
    FROM typedcte
    WHERE rowtype = 'begin'
    GROUP BY userid
)
, endsum (userid, value)
AS (
    SELECT userid, SUM(DATEPART(MINUTE, ISNULL(nextrequestdate, requestdate)))
    FROM typedcte
    WHERE rowtype = 'end'
    GROUP BY userid
)
SELECT beginsum.userid
, endsum.value - beginsum.value AS duration
FROM beginsum
INNER JOIN endsum
    ON beginsum.userid = endsum.userid
;

The only problem here is that I only get output for user 1 and 5 from my original sample data. The added user 6 also gives correct output. The added user 7 gives me a satisfactory output now. User 8 is almost perfect, I miss one minute from the first row to the second.

-- userid  duration
-- 1       10
-- 5       10
-- 6       10
-- 7       9
-- 8       9

I feel like I'm inches away from getting this completely right. The only durations missing are from the pagerequests that didn't happen in groups. Can someone help me find a way to get the lonely pageviews?

Update: Here comes a fourth trial. Here I assign a value to each request and sum them up. It doesn't give me exactlu the output I hoped for, but looks like it could be good enough.

DECLARE @courseid INT;
SET @courseid = 1;

WITH numberedcte (rn, userid, courseid, sessionid, requestdate)
AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY sessionid, userid ORDER BY id)
    , userid
    , courseid
    , sessionid
    , requestdate
    FROM PageLogSample
)
, valuecte (value, userid, courseid, sessionid)
AS (
    SELECT CASE
        --alone
        WHEN ( previousrequest.courseid IS NULL
            OR previousrequest.courseid <> numberedcte.courseid
            )
            AND nextrequest.courseid <> numberedcte.courseid
            THEN DATEDIFF(MINUTE, numberedcte.requestdate, nextrequest.requestdate)
        --between
        WHEN previousrequest.courseid = nextrequest.courseid
            THEN 0
        --begin
        WHEN previousrequest.courseid IS NULL
            OR nextrequest.courseid = numberedcte.courseid
            THEN -1 * DATEPART(MINUTE, numberedcte.requestdate)
        --ignored (end with no next request)
        WHEN nextrequest.courseid IS NULL
            AND previousrequest.courseid <> numberedcte.courseid
            THEN 0
        --end
        WHEN nextrequest.courseid IS NULL
            OR previousrequest.courseid = numberedcte.courseid
            THEN DATEPART(MINUTE, ISNULL(nextrequest.requestdate, numberedcte.requestdate))
        --impossible?
        ELSE 0
    END
    , numberedcte.userid
    , numberedcte.courseid
    , numberedcte.sessionid
    FROM numberedcte
    LEFT JOIN numberedcte previousrequest
        ON previousrequest.userid = numberedcte.userid
        AND previousrequest.sessionid = numberedcte.sessionid
        AND previousrequest.rn = numberedcte.rn - 1
    LEFT JOIN numberedcte nextrequest
        ON nextrequest.userid = numberedcte.userid
        AND nextrequest.sessionid = numberedcte.sessionid
        AND nextrequest.rn = numberedcte.rn + 1
    WHERE numberedcte.courseid = @courseid
)
SELECT userid
, courseid
, COUNT(DISTINCT sessionid) AS sessioncount
, SUM(value) AS duration
FROM valuecte
GROUP BY userid
, courseid
ORDER BY userid
;

As you can see the results are not entirely what I expected.

-- userid  courseid  sessioncount  duration
-- 1       1         1             10
-- 2       1         1              3
-- 3       1         1              6
-- 4       1         1              4
-- 5       1         2             10
-- 6       1         1             10
-- 7       1         1              9
-- 8       1         1             10

Performance is horrible on my local copy of the real database. So if anyone has ideas as to write this in a more performant way ... shoot.

Update: Performance is up. I added an index and it works a charm now.

A: 

Sorry but I think you have a data problem. Looking at the sample data provided user 2 is in courseid 1 for 12 minutes and courseid 2 for 2 minutes.

Are you sure you have supplied the correct data?

armitage
The data is correct, but is is just difficult to get relevant meaning out of it. User 2 starts in course 1, goes to course 2 for two minutes and then returns to course 1. I want the time he spent in course 1 (10 minutes). Thus 12 minutes minus the 2 minutes he spent in another course.
Kristof Neirynck
It seems you were right. My original interpretation of the data was flawed.
Kristof Neirynck
A: 

This is as close as I can get. It fails for userid 4.

As I said in my comment, requestdate is sometimes a start and sometimes an end of a course, and I can't see a simple general rule for deriving which role it plays on a given row.

DECLARE @courseid INT;
SET @courseid = 1;

WITH orderCTE
AS
(
        SELECT *

               ,ROW_NUMBER() OVER (PARTITION BY sessionid
                                   ORDER BY id
                                  ) AS rn
        FROM PageLogSample
        --order by rn
)
,startendCTE
AS
(
        SELECT  CASE WHEN start1.rn = 1
                     THEN start1.courseid
                     ELSE end1.courseid
                 END courseid
                ,start1.sessionid
                ,start1.userid
                ,DATEDIFF(mi,start1.requestdate,end1.requestdate) duration
        FROM orderCTE AS start1
        JOIN orderCTE AS end1
        ON end1.rn = start1.rn + 1
        AND end1.sessionid = start1.sessionid
)
SELECT courseid
       ,COUNT(1) sessionCount
       ,userid
       ,SUM(duration) totalDuration
FROM startendCTE
WHERE courseid = @courseid
GROUP BY courseid
         ,userid;
Ed Harper
I like the idea of searching for the begin- and end-rows first. You've inspired me to take a new approach.
Kristof Neirynck
A: 

This is pretty messy, but it appears to be working for CourseID 1. I didn't try it with other courses, so you may want to test that! :D

The basic premise is that I'm getting the duration of time between the first and last session of the target CourseID and then I'm subtracting the duration of any sessions that were not of the specified CourseID, but where the session request time fell within the min and max request times of the targeted CourseID. I hope that makes sense.

The Query could definitely be cleaned up, possibly with a CTE or something. Interesting question BTW! :)

DECLARE @courseid INT;
SET @courseid = 1;

SELECT 
    TargetCourse.UserID, 
    COUNT(Distinct(TargetCourse.SessionID)) as SessionCount,
    SUM(TargetCourse.Duration - Coalesce(OtherCourses.Duration,0)) as Duration
FROM
(
    SELECT 
     TargetCourse.UserID, TargetCourse.SessionID, 
     MIN(TargetCourse.RequestDate) FirstRequest, MAX(TargetCourse.RequestDate) LastRequest, 
     DATEDIFF(MINUTE, MIN(TargetCourse.RequestDate), MAX(TargetCourse.RequestDate)) AS duration
    FROM 
     PageLogSample TargetCourse
    WHERE
     TargetCourse.CourseID = @courseid
    GROUP BY
     TargetCourse.UserID, TargetCourse.SessionID  
) as TargetCourse
LEFT OUTER JOIN
(
    SELECT 
     OtherCourses.UserID, OtherCourses.SessionID, 
     MIN(OtherCourses.RequestDate) AS FirstRequest, MAX(OtherCourses.RequestDate) AS LastRequest, 
     DATEDIFF(MINUTE, MIN(OtherCourses.RequestDate), MAX(OtherCourses.RequestDate)) AS duration
    FROM 
     PageLogSample OtherCourses
    WHERE
     OtherCourses.CourseID <> @courseid AND
     OtherCourses.RequestDate between
      (Select MIN(RequestDate) From PageLogSample T Where T.UserID = OtherCourses.UserID and T.CourseID = @courseid) AND
      (Select MAX(RequestDate) From PageLogSample T Where T.UserID = OtherCourses.UserID and T.CourseID = @courseid)
    GROUP BY
     OtherCourses.UserID, OtherCourses.SessionID 
) as OtherCourses ON
OtherCourses.UserID = TargetCourse.UserID AND
OtherCourses.FirstRequest BETWEEN TargetCourse.FirstRequest and TargetCourse.LastRequest
Group By TargetCourse.UserID
WesleyJohnson
Very good, but not perfect yet. When you search for the time to subtract you need to keep in mind there could be more than one "gap" with the some courseid. I added an extra user to the sample data to show this.
Kristof Neirynck
Aha, thanks for pointing that out. I'll populate the additional user and give it another go. :)
WesleyJohnson
A: 

"The data is correct, but is is just difficult to get relevant meaning out of it."

I'm pressed to respond that this a contradiction of terms. Data of which you don't know what it means is not data.

As for your original question :

What you need is a DBMS that offers decent support for INTERVAL types. No SQL system plays in that league. Apart from a few tutorial systems, my own DBMS (no further pushing that in this context, so no link) is the only one I know of that offers the kind of support that is really needed for such problems.

If you're interested, google around for "interval types", "packed normal form", "temporal data" and you'll run into it eventually.

Erwin Smout
A: 

Some more sample data and a hopefully logical assumption of how much time each user spent in each course.

INSERT INTO PageLogSample (userid, courseid, sessionid, requestdate)
-- [0, 10] = 10 minutes
          SELECT 1, 1, 1, '00:00:00'
UNION ALL SELECT 1, 1, 1, '00:10:00'
-- [0, 3] = 3 minutes
-- there is no way to know how long the user was on that last page
UNION ALL SELECT 2, 1, 2, '00:00:00'
UNION ALL SELECT 2, 2, 2, '00:03:00'
UNION ALL SELECT 2, 2, 2, '00:05:00'
UNION ALL SELECT 2, 1, 2, '00:12:00'
-- [0, 3] + [12, 15] = 6 minutes
-- the [5, 12] part was spent on a page of course 2
UNION ALL SELECT 3, 1, 3, '00:00:00'
UNION ALL SELECT 3, 2, 3, '00:03:00'
UNION ALL SELECT 3, 2, 3, '00:05:00'
UNION ALL SELECT 3, 1, 3, '00:12:00'
UNION ALL SELECT 3, 2, 3, '00:15:00'
-- [1, 3] + [13, 15] = 4 minutes
UNION ALL SELECT 4, 2, 4, '00:00:00'
UNION ALL SELECT 4, 1, 4, '00:01:00'
UNION ALL SELECT 4, 2, 4, '00:03:00'
UNION ALL SELECT 4, 2, 4, '00:05:00'
UNION ALL SELECT 4, 1, 4, '00:13:00'
UNION ALL SELECT 4, 2, 4, '00:15:00'
-- [0, 5] + [10, 15] = 10 minutes
UNION ALL SELECT 5, 1, 5, '00:00:00'
UNION ALL SELECT 5, 1, 5, '00:05:00'
UNION ALL SELECT 5, 1, 6, '00:10:00'
UNION ALL SELECT 5, 1, 6, '00:15:00'
-- [0, 10] = 10 minutes (ignoring everything inbetween)
UNION ALL SELECT 6, 1, 7, '00:00:00'
UNION ALL SELECT 6, 1, 7, '00:03:00'
UNION ALL SELECT 6, 1, 7, '00:05:00'
UNION ALL SELECT 6, 1, 7, '00:07:00'
UNION ALL SELECT 6, 1, 7, '00:10:00'
-- [0, 5] + [7, 11] = 9 minutes
UNION ALL SELECT 7, 1, 8, '00:00:00'
UNION ALL SELECT 7, 1, 8, '00:03:00'
UNION ALL SELECT 7, 2, 8, '00:05:00'
UNION ALL SELECT 7, 2, 8, '00:06:00'
UNION ALL SELECT 7, 1, 8, '00:07:00'
UNION ALL SELECT 7, 1, 8, '00:11:00'
-- [0, 1] + [2, 4] + [5, 7] + [8, 13] = 10
UNION ALL SELECT 8, 1, 9, '00:00:00'
UNION ALL SELECT 8, 2, 9, '00:01:00'
UNION ALL SELECT 8, 1, 9, '00:02:00'
UNION ALL SELECT 8, 1, 9, '00:03:00'
UNION ALL SELECT 8, 2, 9, '00:04:00'
UNION ALL SELECT 8, 1, 9, '00:05:00'
UNION ALL SELECT 8, 1, 9, '00:06:00'
UNION ALL SELECT 8, 2, 9, '00:07:00'
UNION ALL SELECT 8, 1, 9, '00:08:00'
UNION ALL SELECT 8, 1, 9, '00:13:00'
-- there is nothing we can say about either of there requests
-- 0 minutes
UNION ALL SELECT 9, 1, 10, '00:10:00'
UNION ALL SELECT 9, 1, 11, '00:20:00'
;

Now we get our data like this:

WITH numberedcte (rn, userid, courseid, sessionid, requestdate)
AS (
    SELECT ROW_NUMBER() OVER (PARTITION BY sessionid, userid ORDER BY id)
    , userid
    , courseid
    , sessionid
    , requestdate
    FROM PageLogSample
)
, valuecte (value, userid, courseid, sessionid)
AS (
    SELECT CASE
     --alone in session
     WHEN previousrequest.courseid IS NULL
      AND nextrequest.courseid  IS NULL
      THEN 0
     --alone
     WHEN ( previousrequest.courseid IS NULL
      OR previousrequest.courseid <> numberedcte.courseid
      )
      AND nextrequest.courseid <> numberedcte.courseid
      THEN DATEDIFF(MINUTE, numberedcte.requestdate, nextrequest.requestdate)
     --between
     WHEN previousrequest.courseid = nextrequest.courseid
      THEN 0
     --begin
     WHEN previousrequest.courseid IS NULL
      OR nextrequest.courseid = numberedcte.courseid
      THEN -1 * DATEPART(MINUTE, numberedcte.requestdate)
     --ignored (end with no next request)
     WHEN nextrequest.courseid IS NULL
      AND previousrequest.courseid <> numberedcte.courseid
      THEN 0
     --end
     WHEN nextrequest.courseid IS NULL
      OR previousrequest.courseid = numberedcte.courseid
      THEN DATEPART(MINUTE, ISNULL(nextrequest.requestdate, numberedcte.requestdate))
     --impossible?
     ELSE 0
    END
    , numberedcte.userid
    , numberedcte.courseid
    , numberedcte.sessionid
    FROM numberedcte
    LEFT JOIN numberedcte previousrequest
     ON previousrequest.userid = numberedcte.userid
     AND previousrequest.sessionid = numberedcte.sessionid
     AND previousrequest.rn = numberedcte.rn - 1
    LEFT JOIN numberedcte nextrequest
     ON nextrequest.userid = numberedcte.userid
     AND nextrequest.sessionid = numberedcte.sessionid
     AND nextrequest.rn = numberedcte.rn + 1
    WHERE numberedcte.courseid = @courseid
)
SELECT userid
, courseid
, COUNT(DISTINCT sessionid) AS sessioncount
, SUM(value) AS duration
FROM valuecte
GROUP BY userid
, courseid
ORDER BY userid
;

This is the result I get. I'm quite pleased with it. Notice how the session count remains correct for user 9.

userid  courseid  sessioncount  duration
1       1         1             10
2       1         1              3
3       1         1              6
4       1         1              4
5       1         2             10
6       1         1             10
7       1         1              9
8       1         1             10
9       1         2              0
Kristof Neirynck