CREATE TABLE trip_data (
ride_id VARCHAR(255),
rideable_type VARCHAR(255),
started_at TIMESTAMP,
ended_at TIMESTAMP,
start_station_name VARCHAR(255),
start_station_id VARCHAR(255),
end_station_name VARCHAR(255),
end_station_id VARCHAR(255),
start_lat FLOAT,
start_lng FLOAT,
end_lat FLOAT,
end_lng FLOAT,
member_casual VARCHAR(50)
);
SELECT rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual,
COUNT(*) AS duplicate_count
FROM trip_data
GROUP BY rideable_type,
started_at,
ended_at,
start_station_name,
start_station_id,
end_station_name,
end_station_id,
start_lat,
start_lng,
end_lat,
end_lng,
member_casual
HAVING COUNT(*) > 1;
WITH CTE AS (
SELECT ctid,
ROW_NUMBER() OVER (PARTITION BY rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual ORDER BY ctid) AS rn
FROM trip_data
)
DELETE FROM trip_data
WHERE ctid IN (
SELECT ctid
FROM CTE
WHERE rn > 1
);
SELECT
'rideable_type' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE rideable_type IS NULL
UNION ALL
SELECT
'started_at' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE started_at IS NULL
UNION ALL
SELECT
'ended_at' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE ended_at IS NULL
UNION ALL
SELECT
'start_station_name' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE start_station_name IS NULL
UNION ALL
SELECT
'start_station_id' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE start_station_id IS NULL
UNION ALL
SELECT
'end_station_name' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE end_station_name IS NULL
UNION ALL
SELECT
'end_station_id' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE end_station_id IS NULL
UNION ALL
SELECT
'start_lat' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE start_lat IS NULL
UNION ALL
SELECT
'start_lng' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE start_lng IS NULL
UNION ALL
SELECT
'end_lat' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE end_lat IS NULL
UNION ALL
SELECT
'end_lng' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE end_lng IS NULL
UNION ALL
SELECT
'member_casual' AS column_name, COUNT(*) AS missing_count
FROM trip_data
WHERE member_casual IS NULL;
SELECT
DATE_TRUNC('month', started_at) AS month,
COUNT(*) FILTER (WHERE start_station_id IS NULL) AS missing_count,
COUNT(*) AS total_count,
ROUND((COUNT(*) FILTER (WHERE start_station_id IS NULL)::float / COUNT(*) * 100)::numeric, 2) AS percentage_missing
FROM
trip_data
GROUP BY
month
ORDER BY
month;
SELECT
DATE_TRUNC('month', started_at) AS month,
COUNT(*) FILTER (WHERE end_station_id IS NULL) AS missing_count,
COUNT(*) AS total_count,
ROUND((COUNT(*) FILTER (WHERE end_station_id IS NULL)::float / COUNT(*) * 100)::numeric, 2) AS percentage_missing
FROM
trip_data
GROUP BY
month
ORDER BY
month;
DELETE FROM trip_data
WHERE
start_station_id IS NULL
OR end_station_id IS NULL
OR start_station_name IS NULL
OR end_station_name IS NULL
OR start_lat IS NULL
OR start_lng IS NULL
OR end_lat IS NULL
OR end_lng IS NULL
OR rideable_type IS NULL
OR member_casual IS NULL
OR started_at IS NULL
OR ended_at IS NULL;
SELECT
column_name,
data_type
FROM
information_schema.columns
WHERE
table_name = 'trip_data';
SELECT
COUNT(*) AS inconsistent_entries
FROM
trip_data
WHERE
ended_at < started_at;
DELETE FROM trip_data
WHERE ended_at < started_at;
SELECT DISTINCT rideable_type
FROM trip_data;
SELECT DISTINCT start_station_name FROM trip_data;
SELECT DISTINCT end_station_name FROM trip_data;
-- 1. Convert to Title Case
UPDATE trip_data
SET start_station_name = INITCAP(start_station_name),
end_station_name = INITCAP(end_station_name);
-- 2. Replace Abbreviations
UPDATE trip_data
SET start_station_name = REPLACE(start_station_name, 'St.', 'Street'),
end_station_name = REPLACE(end_station_name, 'St.', 'Street');
-- 3. Trim Extra Spaces
UPDATE trip_data
SET start_station_name = TRIM(BOTH FROM start_station_name),
end_station_name = TRIM(BOTH FROM end_station_name);
-- 4. Remove Duplicate Spaces
UPDATE trip_data
SET start_station_name = REGEXP_REPLACE(start_station_name, '\s+', ' '),
end_station_name = REGEXP_REPLACE(end_station_name, '\s+', ' ');
-- Identifying Unusually Short Durations (eg. < 60 sec)
SELECT *
FROM trip_data
WHERE EXTRACT(EPOCH FROM (ended_at - started_at)) < 60;
-- Identifying Unusually Long Durations (eg. > 1 day)
SELECT *
FROM trip_data
WHERE EXTRACT(EPOCH FROM (ended_at - started_at)) > 86400;
-- Identify Unusually Long Distance Rides (eg. > 20 miles)
WITH distances AS (
SELECT *,
(3959 * acos(
LEAST(1.0, GREATEST(-1.0,
cos(radians(start_lat)) * cos(radians(end_lat)) *
cos(radians(end_lng) - radians(start_lng)) +
sin(radians(start_lat)) * sin(radians(end_lat))
))
)) AS distance_miles
FROM trip_data
)
SELECT *
FROM distances
WHERE distance_miles > 20;