The following exercises are constructed in ascending difficulty order. First ones are pretty easy, made for warm-up. The advanced exercises, on the other hand, can be quite challenging.
-- Basic Analytics
-- 1. How many post were made in each year
SELECT COUNT(*) number_of_posts, YEAR(CreationDate) creation_year
FROM posts
GROUP BY YEAR(CreationDate)
-- 2. How many votes were made in each day of the week (Sunday, Monday, Tuesday, etc.) ?
SELECT COUNT(*) AS 'Number of Posts', DATENAME(DW, CreationDate) AS 'Day of the Week'
FROM votes
GROUP BY DATENAME(DW, CreationDate)
-- 3. List all comments created on 2012-12-19
SELECT *
FROM comments
WHERE CAST(creationDate AS DATE) = '2012-12-19'
-- Or
SELECT *
FROM comments
WHERE DATEDIFF(DAY, CreationDate, '2012-12-19') = 0
-- 4. List all users under the age of 33, living in London
SELECT *
FROM users
WHERE age < 33
AND location LIKE '%London%'
-- Advanced Analytics
-- 1. Display the number of votes for each post title
SELECT pst.Title, COUNT(*) number_of_votes
FROM posts pst JOIN votes vt
ON pst.Id = vt.postID
GROUP BY pst.Title
ORDER BY COUNT(*) DESC
-- 2. Display comments created by users living in the same location as the post creator
SELECT pst.Id AS 'post_id',
pst.Title AS 'post_title',
pst.OwnerUserID AS 'created_by_user',
usr_p.Id AS 'user_id',
usr_p.DisplayName AS 'creator_user_name',
usr_p.location AS 'creator_location',
cmt.UserId AS 'commentor_id',
usr_c.DisplayName AS 'commentor_user_name',
usr_c.location AS 'commentor_location'
FROM posts pst JOIN users usr_p
ON pst.OwnerUserID = usr_p.Id
JOIN comments cmt
ON cmt.postId = pst.Id
JOIN users usr_c
ON cmt.UserID = usr_c.Id
WHERE usr_c.location = usr_p.location
-- 3. How many users have never voted ?
WITH "VOTE-CTE" AS
(
SELECT id FROM users
EXCEPT
SELECT userID FROM votes
)
SELECT COUNT(*)
FROM "VOTE-CTE"
-- 4. Display all posts having the highest amount of comments
WITH "TOP-COMMENT-POSTS"
AS
(
SELECT pst.Title,
COUNT(*) AS 'number_of_comments' ,
DENSE_RANK() OVER (ORDER BY COUNT(*) DESC) AS 'comment_count_ranking'
FROM posts pst JOIN comments cmt
ON pst.id = cmt.postid
GROUP BY pst.Title
)
SELECT Title
FROM "TOP-COMMENT-POSTS"
WHERE comment_count_ranking = 1
-- 5. For each post, how many votes are coming from users living in Canada ?
-- What's their percentage of the total number of votes
SELECT pst.Title,
COUNT(*) number_of_votes,
SUM(CASE WHEN usr.location LIKE '%canada%' THEN 1 ELSE 0 END) AS 'votes_from_canada',
CAST(SUM(CASE WHEN usr.location LIKE '%canada%' THEN 1 ELSE 0 END) AS FLOAT) /
CAST(COUNT(*) AS FLOAT) AS 'votes_percentage'
FROM posts pst JOIN votes vt
ON pst.Id = vt.postID
JOIN users usr
ON vt.UserID = usr.id
GROUP BY pst.Title
ORDER BY COUNT(*) DESC
-- 6. How many hours in average, it takes to the first comment to be posted after a creation of a new post
;WITH "COMMENTS-TIMING-CTE"
AS
(
SELECT pst.id AS 'post_id',
pst.Title AS 'post_title',
pst.creationDate AS 'post_creation_date',
cmt.creationDate AS 'comment_creation_date',
DENSE_RANK() OVER (PARTITION BY pst.id ORDER BY cmt.creationDate) AS 'comment_rank'
FROM posts pst JOIN comments cmt
ON pst.Id = cmt.postID
)
SELECT AVG(DATEDIFF(HOUR, post_creation_date, comment_creation_date)) AS 'avg_num_of_hours'
FROM "COMMENTS-TIMING-CTE"
WHERE comment_rank = 1
-- 7. Whats the most common post tag ?
-- Note, each post may have 1 or more tags.
-- The goal of this question is to find the most common *single* tag
;WITH "CTE-TAGS-SEP" (Tags) AS
(
SELECT CAST(Tags AS VARCHAR(MAX))
FROM Posts
UNION ALL
SELECT STUFF(Tags, 1, CHARINDEX('><' , Tags), '')
FROM "CTE-TAGS-SEP"
WHERE Tags LIKE '%><%'
), "CTE-TAGS-COUNTER" AS
(
SELECT CASE WHEN Tags LIKE '%><%' THEN LEFT(Tags, CHARINDEX('><' , Tags))
ELSE Tags
END AS 'Tags'
FROM "CTE-TAGS-SEP"
)
SELECT TOP 1 COUNT(*), Tags
FROM "CTE-TAGS-COUNTER"
GROUP BY Tags
ORDER BY COUNT(*) DESC
-- Solution by YB
SELECT REPLACE(REPLACE(REPLACE(Tags, '><', ','), '<',''),'>','') 'TAGS' INTO #TABI1
FROM posts
SELECT VALUE, COUNT(*)
FROM #TABI1
CROSS APPLY string_split(TAGS, ',')
GROUP BY VALUE
ORDER BY COUNT(*) DESC
-- 8. Create a pivot table displaying how many posts were created for each year (Y axis) and each month (X axis)
SELECT *
FROM (
SELECT YEAR(CreationDate) AS 'Year', DATENAME(MONTH,CreationDate) AS 'Month', id
FROM posts
) AS S
PIVOT
(
COUNT(id)
FOR [Month] IN ([January], [February], [March], [April], [May], [June], [July], [August], [September], [October], [November], [December])
) AS PVT
ORDER BY [Year]