Let's say you have a table with columns, Date, GroupID, X and Y.
CREATE TABLE #sample
(
[Date] DATETIME,
GroupID INT,
X FLOAT,
Y FLOAT
)
DECLARE @date DATETIME = getdate()
INSERT INTO #sample VALUES(@date, 1, 1,3)
INSERT INTO #sample VALUES(DATEADD(d, 1, @date), 1, 1,1)
INSERT INTO #sample VALUES(DATEADD(d, 2, @date), 1, 4,2)
INSERT INTO #sample VALUES(DATEADD(d, 3, @date), 1, 3,3)
INSERT INTO #sample VALUES(DATEADD(d, 4, @date), 1, 6,4)
INSERT INTO #sample VALUES(DATEADD(d, 5, @date), 1, 7,5)
INSERT INTO #sample VALUES(DATEADD(d, 6, @date), 1, 1,6)
and you want to calculate the correlation of X and Y for each group. Currently I use CTEs which get a little messy:
;WITH DataAvgStd
AS (SELECT GroupID,
AVG(X) AS XAvg,
AVG(Y) AS YAvg,
STDEV(X) AS XStdev,
STDEV(Y) AS YSTDev,
COUNT(*) AS SampleSize
FROM #sample
GROUP BY GroupID),
ExpectedVal
AS (SELECT s.GroupID,
SUM(( X - XAvg ) * ( Y - YAvg )) AS ExpectedValue
FROM #sample s
JOIN DataAvgStd das
ON s.GroupID = das.GroupID
GROUP BY s.GroupID)
SELECT das.GroupID,
ev.ExpectedValue / ( das.SampleSize - 1 ) / ( das.XStdev * das.YSTDev )
AS
Correlation
FROM DataAvgStd das
JOIN ExpectedVal ev
ON das.GroupID = ev.GroupID
DROP TABLE #sample
It seems like there should be a way to use OVER and PARTITION to do this in one fell swoop without any subqueries. Ideally TSQL would have a function so you could write:
SELECT GroupID, CORR(X, Y) OVER(PARTITION BY GroupID)
FROM #sample
GROUP BY GroupID
Using this formula of corellation you cannot avoid all the nested queries even if you use over()
. The thing is that you cannot use both group by and over in the same query and also you can not have nested aggregation functions e.g. sum(x - avg(x))
. So you in best case scenario, according to your data, you will need to keep at least the with
.
Your code will look like something like that
;WITH DataAvgStd
AS (SELECT GroupID,
STDEV(X) over(partition by GroupID) AS XStdev,
STDEV(Y) over(partition by GroupID) AS YSTDev,
COUNT(*) over(partition by GroupID) AS SampleSize,
( X - AVG(X) over(partition by GroupID)) * ( Y - AVG(Y) over(partition by GroupID)) AS ExpectedValue
FROM #sample s)
SELECT distinct GroupID,
SUM(ExpectedValue) over(partition by GroupID) / (SampleSize - 1 ) / ( XStdev * YSTDev ) AS Correlation
FROM DataAvgStd
An alternative is to use an equilevant formula for correlation as Wikipedia describes.
This can be written as
SELECT GroupID,
Correlation=(COUNT(*) * SUM(X * Y) - SUM(X) * SUM(Y)) /
(SQRT(COUNT(*) * SUM(X * X) - SUM(X) * SUM(x))
* SQRT(COUNT(*) * SUM(Y* Y) - SUM(Y) * SUM(Y)))
FROM #sample s
GROUP BY GroupID;