TikTok-Like Data Simulation using PostgreSQL

Budi Wijaya Putra
10 min readOct 28, 2024

--

Background

In today’s world, social media platforms are more than just places for interaction, they generate a massive amount of data every second. Analyzing this data helps to understand user behavior, identify trends, and power algorithmic recommendations. Inspired by the mechanics behind TikTok, i started a small project to simulate a social media environment using PostgreSQL.

This simulation doesn’t aim to replicate every feature of TikTok, but serves as a proof of concept for managing key data points.

Let’s dive in!

Database Design and Schema

Schema Design to SQL Queries

CREATE DATABASE tiktok
WITH
OWNER = postgres
ENCODING = 'UTF8'
LOCALE_PROVIDER = 'libc'
CONNECTION LIMIT = -1
IS_TEMPLATE = False;
CREATE TABLE users (
id SERIAL PRIMARY KEY,
username VARCHAR(100) NOT NULL,
birth_of_date DATE NOT NULL,
gender VARCHAR(10) NOT NULL,
region VARCHAR(50) NOT NULL,
signup_date TIMESTAMP NOT NULL);

CREATE TABLE video (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
video_length INTEGER NOT NULL,
caption VARCHAR(255) NOT NULL,
upload_date TIMESTAMP NOT NULL ,
hashtags VARCHAR(255),
FOREIGN KEY (user_id) REFERENCES users(id));

CREATE TABLE video_views (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
video_id INTEGER NOT NULL,
view_duration INTEGER NOT NULL,
view_date TIMESTAMP NOT NULL,
FOREIGN KEY (user_id) REFERENCES users(id),
FOREIGN KEY (video_id) REFERENCES video(id));

CREATE TABLE user_activity (
id INTEGER PRIMARY KEY,
user_id INTEGER NOT NULL,
video_id INTEGER NOT NULL,
is_like INTEGER NOT NULL CHECK (is_like IN (0, 1)),
is_favorite INTEGER NOT NULL CHECK (is_favorite IN (0, 1)),
is_share INTEGER NOT NULL CHECK (is_share IN (0, 1)),
is_dislike INTEGER NOT NULL CHECK (is_dislike IN (0, 1)),
activity_date TIMESTAMP NOT NULL,
FOREIGN KEY (user_id) REFERENCES users(id),
FOREIGN KEY (video_id) REFERENCES video(id));

CREATE TABLE user_comments (
id SERIAL PRIMARY KEY,
user_id INTEGER NOT NULL,
video_id INTEGER NOT NULL,
comment_text varchar(255) NOT NULL,
comment_date TIMESTAMP NOT NULL,
FOREIGN KEY (user_id) REFERENCES users(id),
FOREIGN KEY (video_id) REFERENCES video(id));

CREATE TABLE user_following (
id SERIAL PRIMARY KEY,
is_follow INTEGER NOT NULL CHECK (is_follow IN (0, 1)),
follower_id INTEGER NOT NULL,
followee_id INTEGER NOT NULL,
following_date TIMESTAMP NOT NULL,
FOREIGN KEY (follower_id) REFERENCES users(id),
FOREIGN KEY (followee_id) REFERENCES users(id));

Populate Sample Data

The CSV version is available for access [here]

INSERT INTO users 
(id, username, date_of_birth, gender, region, signup_date)
VALUES
(49, 'Porter York', '1987-11-06', 'Male', 'Peru',
'2020-09-13 13:43:22'),
(40, 'Eugenia Prince', '1986-04-22', 'Male', 'Chile',
'2020-10-12 08:59:57'),
(45, 'Jameson Lindsay', '1997-10-22', 'Male', 'Chile',
'2020-03-16 02:39:45');

INSERT INTO video
(id, user_id, video_length, caption, upload_date, hashtags)
VALUES
(1, 1, 54, 'Peanut & Drache', '2021-08-07 06:30:00',
'#DailyLife, #Gaming, #Health, #Comedy, #Music'),
(2, 14, 43, 'Artistry and inspiration', '2021-01-27 04:46:00',
'#Art, #Comedy, #Beauty, #Dance'),
(3, 25, 40, 'Yay or nay?', '2021-04-04 00:14:00',
'#Music, #Sports, #Science, #Outdoors');

INSERT INTO video_views
(id, user_id, video_id, view_duration, view_date)
VALUES
(1, 27, 1, 15, '2022-11-01 07:41:00'),
(2, 22, 1, 49, '2022-04-13 10:22:00'),
(3, 19, 1, 22, '2022-04-19 17:52:00');

INSERT INTO user_activity
(id, user_id, video_id, is_like, is_favorite, is_share, is_dislike,
activity_date)
VALUES
(6, 6, 1, 0, 0, 1, 0, '2023-04-06 05:37:00'),
(11, 21, 2, 0, 1, 0, 0, '2023-05-19 14:15:00'),
(20, 43, 5, 0, 0, 0, 1, '2023-08-29 22:28:00');

INSERT INTO user_comments
(id, user_id, video_id, comment_text, comment_date)
VALUES
(1, 4, 1, 'So delicious', '2023-05-02 23:29:00'),
(2, 12, 1, 'Lovely', '2023-07-03 13:28:00'),
(3, 19, 2, 'Wonderful match of colors', '2023-09-29 07:39:00');

INSERT INTO user_following
(id, is_follow, follower_id, followee_id, following_date)
VALUES
(1, 1, 12, 1, '2023-01-05 05:04:00'),
(2, 1, 10, 1, '2023-02-10 11:40:00'),
(3, 1, 6, 1, '2023-03-10 18:11:00');

User Behavior Analysis

Case 1. Monthly Follower growth

This metric provides insights into how a user’s popularity changes over time, which can indicate the effectiveness of content, engagement strategies, or even seasonal trends.

Query :

WITH cte_following AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY follower_id, followee_id
ORDER BY following_date DESC ) AS latest_row,
EXTRACT(year from following_date) as following_year,
EXTRACT(month from following_date) as following_month
FROM user_following
ORDER BY following_date)

,monthly_followers AS (
SELECT
followee_id AS user_id,
following_year,
following_month,
COUNT(follower_id) AS new_followers
FROM cte_following
WHERE is_follow=1
GROUP BY followee_id, following_year, following_month
ORDER BY followee_id,following_month)

,cte_cumulative_followers AS (
SELECT
mf.user_id,
u.username,
mf.following_year AS year,
mf.following_month AS month,
mf.new_followers,
SUM(mf.new_followers) OVER(PARTITION BY user_id
ORDER BY mf.following_year , mf.following_month)
AS cumulative_followers
FROM monthly_followers mf
LEFT JOIN users u ON mf.user_id=u.id
ORDER BY mf.user_id, mf.following_year, mf.following_month)

SELECT
user_id,
username,
year,
month,
new_followers,
cumulative_followers,
COALESCE(LAG(cumulative_followers)
OVER(PARTITION BY user_id ORDER BY year, month),0)
AS previous_month_followers,
COALESCE(ROUND((cumulative_followers - LAG(cumulative_followers)
OVER(PARTITION BY user_id ORDER BY year,month)) * 100.0 /
cumulative_followers,2),0)
AS follower_growth_percentage
FROM cte_cumulative_followers

Result :

Monthly followers growth

Case 2. Top 5 Users with Highest Engagement Points

Top users are usually influential or have highly engaging content that resonates well with their audience, which can be valuable for understanding trends, content effectiveness, and potential areas for growth. How to Calculate Engagement Points :

  • Like: 1 point
  • Comment: 1 point
  • Share: 1 point
  • Favorite: 1 point
  • Dislike: -1 point

Query :

WITH cte_activity AS (
SELECT
user_id,
video_id,
is_like,
is_favorite,
is_share,
is_dislike,
activity_date,
ROW_NUMBER() OVER(PARTITION BY user_id, video_id
ORDER BY activity_date DESC)
AS latest_row
FROM user_activity)

,cte_count_activity AS (
SELECT video_id,
COUNT(CASE WHEN is_like = 1 THEN is_like END)
AS number_of_likes,
COUNT(CASE WHEN is_favorite = 1 THEN is_favorite END)
AS number_of_favorites,
COUNT(CASE WHEN is_share = 1 THEN is_share END)
AS number_of_shares,
COUNT(CASE WHEN is_dislike = 1 THEN is_dislike END)
AS number_of_dislikes
FROM cte_activity
WHERE latest_row=1
GROUP BY video_id)

,cte_count_comments AS (
SELECT video_id,
COUNT(comment_text) AS number_of_comments
FROM user_comments
GROUP BY video_id)

,cte_user_engagement AS (
SELECT
v.user_id,
COALESCE(ca.number_of_likes,0) AS number_of_likes,
COALESCE(ca.number_of_favorites,0) AS number_of_favorites,
COALESCE(ca.number_of_shares,0) AS number_of_shares,
COALESCE(ca.number_of_dislikes,0) AS number_of_dislikes,
COALESCE(cc.number_of_comments,0) AS number_of_comments
FROM video as v
LEFT JOIN cte_count_activity as ca ON v.id=ca.video_id
LEFT JOIN cte_count_comments as cc ON v.id=cc.video_id)

SELECT DISTINCT
u.username,
SUM(ue.number_of_likes) OVER(PARTITION BY ue.user_id)
AS likes,
SUM(ue.number_of_favorites) OVER(PARTITION BY ue.user_id)
AS favorites,
SUM(ue.number_of_shares) OVER(PARTITION BY ue.user_id)
AS shares,
SUM(ue.number_of_dislikes) OVER(PARTITION BY ue.user_id)
AS dislikes,
SUM(ue.number_of_comments) OVER(PARTITION BY ue.user_id)
AS comments,
(SUM(ue.number_of_likes) OVER(PARTITION BY ue.user_id)+
SUM(ue.number_of_favorites) OVER(PARTITION BY ue.user_id)+
SUM(ue.number_of_shares) OVER(PARTITION BY ue.user_id)+
SUM(ue.number_of_comments) OVER(PARTITION BY ue.user_id))-
SUM(ue.number_of_dislikes) OVER(PARTITION BY ue.user_id)
AS user_engagement_points
FROM cte_user_engagement AS ue
LEFT JOIN users as U ON ue.user_id=u.id
ORDER BY user_engagement_points DESC

Result :

Users with highest engagement points

Case 3. User Distribution by Country, Age, and Gender

This type of distribution analysis helps you understand where your users are from, their age demographics, and gender representation, revealing important trends and patterns within the user population.

Query :

SELECT 
region,
gender,
CASE
WHEN EXTRACT(YEAR FROM AGE(date_of_birth))
BETWEEN 13 AND 17 THEN '13-17'
WHEN EXTRACT(YEAR FROM AGE(date_of_birth))
BETWEEN 18 AND 24 THEN '18-24'
WHEN EXTRACT(YEAR FROM AGE(date_of_birth))
BETWEEN 25 AND 34 THEN '25-34'
WHEN EXTRACT(YEAR FROM AGE(date_of_birth))
BETWEEN 35 AND 44 THEN '35-44'
ELSE '45+'
END AS age_group,
COUNT(id) AS user_count
FROM users
GROUP BY region, gender, age_group
ORDER BY region, gender, age_group

Result :

User distribution by country, gender and age

Content Performance Analysis

Case 1. Top 5 videos with highest engagement points
Engagement points are calculated based on various user actions, such as likes, comments, shares, favorites, and other interactive behaviors that show user interest.

Query :

WITH cte_activity AS (
SELECT
user_id,
video_id,
is_like,
is_favorite,
is_share,
is_dislike,
activity_date,
ROW_NUMBER() OVER(PARTITION BY user_id, video_id
ORDER BY activity_date DESC)
AS latest_row
FROM user_activity)

,cte_count_activity AS (
SELECT
video_id,
COUNT(CASE WHEN is_like = 1 THEN is_like END)
AS number_of_likes,
COUNT(CASE WHEN is_favorite = 1 THEN is_favorite END)
AS number_of_favorites,
COUNT(CASE WHEN is_share = 1 THEN is_share END)
AS number_of_shares,
COUNT(CASE WHEN is_dislike = 1 THEN is_dislike END)
AS number_of_dislikes
FROM cte_activity
WHERE latest_row=1
GROUP BY video_id)

,cte_count_comments AS (
SELECT video_id,
COUNT(comment_text) AS number_of_comments
FROM user_comments
GROUP BY video_id)

SELECT
v.id AS video_id,
v.caption AS video_caption,
u.username AS author_name,
COALESCE(ca.number_of_likes,0) AS number_of_likes,
COALESCE(ca.number_of_favorites,0) AS number_of_favorites,
COALESCE(ca.number_of_shares,0) AS number_of_shares,
COALESCE(ca.number_of_dislikes,0) AS number_of_dislikes,
COALESCE(cc.number_of_comments,0) AS number_of_comments,
(COALESCE(ca.number_of_likes,0)+
COALESCE(ca.number_of_favorites,0)+
COALESCE(ca.number_of_shares,0)+
COALESCE(cc.number_of_comments,0))-
COALESCE(ca.number_of_dislikes,0)
AS engagement_points
FROM video AS v
LEFT JOIN cte_count_activity AS ca
ON v.id = ca.video_id
LEFT JOIN cte_count_comments AS cc
ON v.id = cc.video_id
LEFT JOIN users AS u
ON v.user_id = u.id
ORDER BY engagement_points DESC
LIMIT 5

Result :

Videos with Highest Engagement Points

Case 2. Hashtags Usage Trends and How it Affects Video Performance

Examines how frequently certain hashtags are used and evaluates their impact on key video performance metrics like views, likes, favorites, shares, comments, and overall engagement.

Query :

WITH hashtag_split AS (
SELECT id AS video_id,
TRIM(UNNEST(string_to_array(hashtags, ','))) AS hashtag
FROM video
WHERE hashtags IS NOT NULL)

,cte_activity AS (
SELECT
id,
user_id,
video_id,
is_like,
is_favorite,
is_share,
is_dislike,
activity_date,
ROW_NUMBER() OVER(PARTITION BY user_id, video_id
ORDER BY activity_date DESC)
AS latest_row
FROM user_activity)

,cte_latest_activity AS (
SELECT *
FROM cte_activity
WHERE latest_row=1)

SELECT
hs.hashtag,
COUNT(DISTINCT vv.id) AS views,
COUNT(DISTINCT CASE WHEN la.is_like=1 THEN la.id END) likes,
COUNT(DISTINCT CASE WHEN la.is_favorite=1 THEN la.id END) favorites,
COUNT(DISTINCT CASE WHEN la.is_share=1 THEN la.id END) shares,
COUNT(DISTINCT uc.id) AS comments,
ROUND(((COUNT(DISTINCT CASE WHEN la.is_like=1 THEN la.id END)+
COUNT(DISTINCT CASE WHEN la.is_favorite=1 THEN la.id END)+
COUNT(DISTINCT CASE WHEN la.is_share=1 THEN la.id END)+
COUNT(DISTINCT uc.id))::numeric/
COUNT(DISTINCT vv.id))*100,0)
AS engagement_points
FROM hashtag_split AS hs
LEFT JOIN video_views AS vv ON hs.video_id=vv.video_id
LEFT JOIN cte_latest_activity AS la ON hs.video_id=la.video_id
LEFT JOIN user_comments AS uc ON hs.video_id=uc.video_id
GROUP BY hs.hashtag
ORDER BY engagement_points DESC

Result :

Hashtags Usage Trends

Engagement & Influence Rate Analysis

Case 1. Users with Highest Engagement Rate
This metric provides insight into which users create the most compelling content that resonates with their audience.

Engagement Rate : (Engagement Points / Total Views) * 100%

Query :

WITH cte_activity AS (
SELECT
user_id,
video_id,
is_like,
is_favorite,
is_share,
is_dislike,
activity_date,
ROW_NUMBER() OVER(PARTITION BY user_id, video_id
ORDER BY activity_date DESC)
AS latest_row
FROM user_activity)

,cte_count_activity AS (
SELECT video_id,
COUNT(CASE WHEN is_like = 1 THEN is_like END)
AS number_of_likes,
COUNT(CASE WHEN is_favorite = 1 THEN is_favorite END)
AS number_of_favorites,
COUNT(CASE WHEN is_share = 1 THEN is_share END)
AS number_of_shares,
COUNT(CASE WHEN is_dislike = 1 THEN is_dislike END)
AS number_of_dislikes
FROM cte_activity
WHERE latest_row=1
GROUP BY video_id)

,cte_count_comments AS (
SELECT video_id,
COUNT(comment_text) AS number_of_comments
FROM user_comments
GROUP BY video_id)

,cte_user_engagement AS (
SELECT
v.user_id,
COALESCE(ca.number_of_likes,0) AS number_of_likes,
COALESCE(ca.number_of_favorites,0) AS number_of_favorites,
COALESCE(ca.number_of_shares,0) AS number_of_shares,
COALESCE(ca.number_of_dislikes,0) AS number_of_dislikes,
COALESCE(cc.number_of_comments,0) AS number_of_comments
FROM video as v
LEFT JOIN cte_count_activity AS ca ON v.id=ca.video_id
LEFT JOIN cte_count_comments AS cc ON v.id=cc.video_id)

,cte_user_points AS (
SELECT DISTINCT
user_id,
SUM(number_of_likes) OVER(PARTITION BY user_id)
AS num_of_likes,
SUM(number_of_favorites) OVER(PARTITION BY user_id)
AS num_of_favorites,
SUM(number_of_shares) OVER(PARTITION BY user_id)
AS num_of_shares,
SUM(number_of_dislikes) OVER(PARTITION BY user_id)
AS num_of_dislikes,
SUM(number_of_comments) OVER(PARTITION BY user_id)
AS num_of_comments,
(SUM(number_of_likes) OVER(PARTITION BY user_id)+
SUM(number_of_favorites) OVER(PARTITION BY user_id)+
SUM(number_of_shares) OVER(PARTITION BY user_id)+
SUM(number_of_comments) OVER(PARTITION BY user_id))-
SUM(number_of_dislikes) OVER(PARTITION BY user_id)
AS user_engagement_points
FROM cte_user_engagement
ORDER BY user_engagement_points DESC)

,cte_count_views AS (
SELECT video_id,
COUNT(user_id) AS total_views
FROM video_views
GROUP BY video_id)

,cte_viewership AS (
SELECT DISTINCT
v.user_id,
SUM(total_views) OVER(PARTITION BY user_id) as viewership
FROM cte_count_views AS cv
LEFT JOIN video AS v ON cv.video_id = v.id)

SELECT u.id AS user_id,
u.username,
COALESCE(up.user_engagement_points,0)
AS user_engagement_points,cv.viewership,
ROUND((COALESCE(up.user_engagement_points,0)/
cv.viewership)*100,0) AS user_engagement_rate
FROM users AS u
LEFT JOIN cte_user_points AS up ON u.id=up.user_id
LEFT JOIN cte_viewership AS cv ON u.id=cv.user_id
WHERE user_engagement_points > 0 AND viewership > 0
ORDER BY user_engagement_rate DESC

Result :

Users with Highest Engagement Rate

Case 2. Users with Highest Influence Rate

Identifies users who have a strong impact on audience behavior. These users effectively influence their audience to take meaningful actions, making them valuable for growth or marketing strategies.

Influence Rate : (Engagement Points / Followers) * 100%

Query :

WITH cte_activity AS (
SELECT
user_id,
video_id,
is_like,
is_favorite,
is_share,
is_dislike,
activity_date,
ROW_NUMBER() OVER(PARTITION BY user_id, video_id
ORDER BY activity_date DESC) AS latest_row
FROM user_activity)

,cte_count_activity AS (

SELECT
video_id,
COUNT(CASE WHEN is_like = 1 THEN is_like END)
AS number_of_likes,
COUNT(CASE WHEN is_favorite = 1 THEN is_favorite END)
AS number_of_favorites,
COUNT(CASE WHEN is_share = 1 THEN is_share END)
AS number_of_shares,
COUNT(CASE WHEN is_dislike = 1 THEN is_dislike END)
AS number_of_dislikes
FROM cte_activity
WHERE latest_row=1
GROUP BY video_id)

,cte_count_comments AS (
SELECT video_id,
COUNT(comment_text) AS number_of_comments
FROM user_comments
GROUP BY video_id)

,cte_user_engagement AS (
SELECT
v.user_id,
COALESCE(ca.number_of_likes,0) AS number_of_likes,
COALESCE(ca.number_of_favorites,0) AS number_of_favorites,
COALESCE(ca.number_of_shares,0) AS number_of_shares,
COALESCE(ca.number_of_dislikes,0) AS number_of_dislikes,
COALESCE(cc.number_of_comments,0) AS number_of_comments
FROM video AS v
LEFT JOIN cte_count_activity AS ca ON v.id=ca.video_id
LEFT JOIN cte_count_comments AS cc ON v.id=cc.video_id)

,cte_user_points AS (
SELECT DISTINCT
user_id,
SUM(number_of_likes) OVER(PARTITION BY user_id) AS num_of_likes,
SUM(number_of_favorites) OVER(PARTITION BY user_id) AS num_of_favorites,
SUM(number_of_shares) OVER(PARTITION BY user_id) AS num_of_shares,
SUM(number_of_dislikes) OVER(PARTITION BY user_id) AS num_of_dislikes,
SUM(number_of_comments) OVER(PARTITION BY user_id) AS num_of_comments,
(SUM(number_of_likes) OVER(PARTITION BY user_id)+
SUM(number_of_favorites) OVER(PARTITION BY user_id)+
SUM(number_of_shares) OVER(PARTITION BY user_id)+
SUM(number_of_comments) OVER(PARTITION BY user_id))-
SUM(number_of_dislikes) OVER(PARTITION BY user_id)
AS user_engagement_points
FROM cte_user_engagement
ORDER BY user_engagement_points DESC)

,cte_last_following AS (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY follower_id, followee_id
ORDER BY following_date DESC)
AS latest_row,
EXTRACT(month from following_date) as following_month
FROM user_following)

,cte_followers AS (
SELECT followee_id as user_id,
COUNT(CASE WHEN is_follow=1 THEN is_follow END) AS followers
FROM cte_last_following
WHERE latest_row=1
GROUP BY followee_id)

SELECT
u.id,
u.username,
COALESCE(up.user_engagement_points,0) AS user_engagement_points,
COALESCE(cf.followers,0) AS followers,
ROUND((COALESCE(up.user_engagement_points,0)/
COALESCE(cf.followers,0))*100,0) AS user_influence_rate
FROM users AS u
LEFT JOIN cte_user_points AS up ON u.id=up.user_id
LEFT JOIN cte_followers AS cf ON u.id=cf.user_id
WHERE up.user_engagement_points > 0 AND cf.followers > 0
ORDER BY user_influence_rate DESC

Result :

Users with Highest Influence Rate

Conclusion & Insights

Content Strategy Improvement: Focus on creating content that generates high engagement points by leveraging the most effective hashtags and understanding what types of posts receive the most likes and shares.

Influencer Collaboration: Engage with influencers who have a high influence rate to maximize reach and engagement for promotional content.

--

--

Budi Wijaya Putra
Budi Wijaya Putra

Written by Budi Wijaya Putra

0 Followers

Hi, I'm Budi Wijaya Putra. I'm constantly learning and growing in this field. Take a look at my journey: bit.ly/budiwijayaputra

No responses yet