views:

1238

answers:

2

Currently I'm working on an awk script to do some statistical analysis on measurement data. I'm using linear regression to get parameter estimates, standard errors etc. and would also like to compute the p-value for a null-hypothesis test (t-test).

This is my script so far, any idea how to compute the p-value?

BEGIN {
    ybar = 0.0
    xbar = 0.0
    n = 0
    a0 = 0.0
    b0 = 0.0
    qtinf0975 = 1.960 # 5% n = inf
}

{ # y_i is in $1, x_i has to be counted
    n = n + 1
    yi[n] = $1*1.0
    xi[n] = n*1.0
}

END {
    for ( i = 1; i <= n ; i++ ) {
     ybar = ybar + yi[i]
     xbar = xbar + xi[i]
    }
    ybar = ybar/(n*1.0)
    xbar = xbar/(n*1.0)

    bhat = 0.0
    ssqx = 0.0
    for ( i = 1; i <= n; i++ ) {
     bhat = bhat + (yi[i] - ybar)*(xi[i] - xbar)
     ssqx = ssqx + (xi[i] - xbar)*(xi[i] - xbar)
    }
    bhat = bhat/ssqx
    ahat = ybar - bhat*xbar

    print "n: ", n
    print "alpha-hat: ", ahat
    print "beta-hat: ", bhat

    sigmahat2 = 0.0
    for ( i = 1; i <= n; i++ ) {
     ri[i] = yi[i] - (ahat + bhat*xi[i])
     sigmahat2 = sigmahat2 + ri[i]*ri[i]
    }
    sigmahat2 = sigmahat2 / ( n*1.0 - 2.0 )

    print "sigma-hat square: ", sigmahat2

    seb = sqrt(sigmahat2/ssqx)

    print "se(b): ", seb

    sigmahat = sqrt((seb*seb)*ssqx)
    print "sigma-hat: ", sigma
    sea = sqrt(sigmahat*sigmahat * ( 1 /(n*1.0) + xbar*xbar/ssqx))

    print "se(a): ", sea


    # Tests

    print "q(inf)(97.5%): ", qtinf0975

    Tb = (bhat - b0) / seb
    if ( qtinf0975 > Tb )
     print "T(b) plausible: ", Tb, " < ", qtinf0975
    else
     print "T(b) NOT plausible: ", Tb, " > ", qtinf0975

    print "confidence(b): [", bhat - seb * qtinf0975,", ", bhat + seb * qtinf0975 ,"]"

    Ta = (ahat - a0) / sea
    if ( qtinf0975 > Ta )
     print "T(a) plausible: ", Ta, " < ", qtinf0975
    else
     print "T(a) NOT plausible: ", Ta, " > ", qtinf0975

    print "confidence(a): [", ahat - seb * qtinf0975,", ", ahat + seb * qtinf0975 ,"]"
}
+1  A: 

You're probably trying to do a paired t-test under the assumption of variance equality. I suggest you have a look at the corresponding entry in the excellent MathWorld website.

lindelof
A: 

OK, I've found a javascript implementation and ported it to awk this are the functions used to compute the p-value:

function statcom ( mq, mi, mj, mb )
{
    zz = 1
    mz = zz
    mk = mi
    while ( mk <= mj ) {
     zz = zz * mq * mk / ( mk - mb)
     mz = mz + zz
     mk = mk + 2
    }
    return mz
}

function studpval ( mt , mn )
{
    PI = 3.1415926535897932384626433832795028841971693993751058209749445923078164062862089986280348253421170679 # thank you wikipedia
    if ( mt < 0 )
     mt = -mt
    mw = mt / sqrt(mn)
    th = atan2(mw, 1)
    if ( mn == 1 )
     return 1.0 - th / (PI/2.0)
    sth = sin(th)
    cth = cos(th)
    if ( mn % 2 == 1 )
     return 1.0 - (th+sth*cth*statcom(cth*cth, 2, mn-3, -1))/(PI/2.0)
    else
     return 1.0 - sth * statcom(cth*cth, 1, mn-3, -1)
}

I've integrated them like this:

    pvalb = studpval(Tb, n)
    if ( pvalb > 0.05 )
     print "p-value(b) plausible: ", pvalb, " > 0.05"
    else
     print "p-value(b) NOT plausible: ", pvalb, " < 0.05"

    pvala = studpval(Ta, n)
    if ( pvala > 0.05 )
     print "p-value(a) plausible: ", pvala, " > 0.05"
    else
     print "p-value(a) NOT plausible: ", pvala, " < 0.05"
x-way