Publish AQI bulletins data on Datasette¶
This notebook publishes the AQI bulletins data we collected as a browsable Datasette instance.
Steps:¶
STEP 1: Import data into a SQLite Database¶
In [1]:
import sqlite_utils
import pandas as pd
In [107]:
# Time series data -- AQI Bulletins data of all Indian Cities
time_series_df = pd.read_csv("C:/Users/dskcy/UEInfo/AQI_bulletins/data/Processed/AllIndiaBulletins_Master_2024.csv")
In [108]:
# Clean: keep the relevant columns and standardise names and values
time_series_df = time_series_df[['date', 'City', 'No. Stations', 'Air Quality', 'Index Value', 'Prominent Pollutant']]
time_series_df.columns=['date', 'city', 'no_stations', 'aqi_category','aqi','prominent_pollutant']
time_series_df = time_series_df.replace(to_replace='Very poor', value='Very Poor')
# Replace "OZONE" with "O3" in the 'prominent_pollutant' column
time_series_df['prominent_pollutant'] = time_series_df['prominent_pollutant'].astype(str).str.replace("OZONE", "O3", regex=False)
time_series_df['date'] = pd.to_datetime(time_series_df['date'])
In [109]:
time_series_df['year'] = time_series_df['date'].dt.year
time_series_df['month'] = time_series_df['date'].dt.strftime('%b')
In [116]:
duplicates = time_series_df[time_series_df.duplicated(subset=['date', 'city'], keep=False)]
duplicates
Out[116]:
| | date | city | no_stations | aqi_category | aqi | prominent_pollutant | year | month |
|---|---|---|---|---|---|---|---|---|
| 179252 | 2022-03-31 | Aurangabad | 1 | Moderate | 104 | PM10 | 2022 | Mar |
| 179253 | 2022-03-31 | Aurangabad | 1 | Poor | 268 | PM10 | 2022 | Mar |
| 179301 | 2022-04-01 | Aurangabad | 1 | Very Poor | 312 | PM10 | 2022 | Apr |
| 179302 | 2022-04-01 | Aurangabad | 1 | Moderate | 108 | PM10 | 2022 | Apr |
| 179670 | 2022-04-03 | Aurangabad | 1 | Very Poor | 317 | PM10 | 2022 | Apr |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 212132 | 2022-11-01 | Aurangabad | 1 | Moderate | 117 | PM2.5 | 2022 | Nov |
| 212193 | 2022-11-02 | Aurangabad | 1 | Moderate | 107 | PM10 | 2022 | Nov |
| 212194 | 2022-11-02 | Aurangabad | 1 | Moderate | 144 | PM10 | 2022 | Nov |
| 212418 | 2022-11-03 | Aurangabad | 1 | Moderate | 169 | PM10 | 2022 | Nov |
| 212419 | 2022-11-03 | Aurangabad | 1 | Moderate | 128 | PM2.5 | 2022 | Nov |

68 rows × 8 columns
On these dates, the bulletins did not differentiate between the two cities named Aurangabad, one in Maharashtra and one in Bihar, so both appear under the same name. We drop these duplicate (date, city) rows before inserting the data.
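As a quick sanity check, we can confirm that deduplicating on (date, city) leaves no conflicting rows; a minimal sketch (the deduped name is just for illustration):

deduped = time_series_df.drop_duplicates(subset=["date", "city"])
assert not deduped.duplicated(subset=["date", "city"]).any()
print(f"Dropped {len(time_series_df) - len(deduped)} duplicate rows")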
In [8]:
db = sqlite_utils.Database("C:/Users/dskcy/UEInfo/AQI_bulletins/data/Processed/aqi-bulletins.db")
# this creates or connects to an SQLite file
In [120]:
# Create a table in the db and insert the deduplicated rows into it
table_name = "aqi_bulletins"
db[table_name].insert_all(
    time_series_df.drop_duplicates(subset=["date", "city"]).to_dict(orient="records"),
    alter=True,
    pk=("date", "city"),  # Composite primary key; prevents an auto-created rowid
)
Out[120]:
<Table aqi_bulletins (date, city, no_stations, aqi_category, aqi, prominent_pollutant, year, month)>
In [121]:
# Normalise stored dates to YYYY-MM-DD (strip the time component)
db.execute("""
UPDATE aqi_bulletins
SET date = substr(date, 1, 10)
""")
db.conn.commit()
In [9]:
# To avoid facet timeout errors, create indexes on the columns we want to facet by
db.execute("CREATE INDEX IF NOT EXISTS idx_city ON aqi_bulletins(city)")
db.execute("CREATE INDEX IF NOT EXISTS idx_aqicategory ON aqi_bulletins(aqi_category)")
db.conn.commit()
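To confirm the indexes were created, sqlite_utils can list them per table; a minimal sketch, which should print idx_city and idx_aqicategory with their columns:

for idx in db["aqi_bulletins"].indexes:
    print(idx.name, idx.columns)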
STEP 2: Explore the SQLite db created¶
In [3]:
# See all tables in the db
db.tables
Out[3]:
[<Table aqi_bulletins (date, city, no_stations, aqi_category, aqi, prominent_pollutant, year, month)>]
In [4]:
for table_name in db.table_names():
    row_count = db[table_name].count
    print(f"{table_name}: {row_count} rows")
aqi_bulletins: 387686 rows
In [125]:
# Inspect schema
print(db["aqi_bulletins"].schema)
CREATE TABLE [aqi_bulletins] (
   [date] TEXT,
   [city] TEXT,
   [no_stations] INTEGER,
   [aqi_category] TEXT,
   [aqi] INTEGER,
   [prominent_pollutant] TEXT,
   [year] INTEGER,
   [month] TEXT,
   PRIMARY KEY ([date], [city])
)
In [126]:
# Show the first 5 rows in the table without materialising all of them
from itertools import islice
list(islice(db[table_name].rows, 5))
Out[126]:
[{'date': '2015-05-01', 'city': 'Varanasi', 'no_stations': 0, 'aqi_category': 'Moderate', 'aqi': 157, 'prominent_pollutant': 'PM10', 'year': 2015, 'month': 'May'},
 {'date': '2015-05-01', 'city': 'Kanpur', 'no_stations': 0, 'aqi_category': 'Moderate', 'aqi': 175, 'prominent_pollutant': 'PM10', 'year': 2015, 'month': 'May'},
 {'date': '2015-05-01', 'city': 'Faridabad', 'no_stations': 0, 'aqi_category': 'Moderate', 'aqi': 173, 'prominent_pollutant': 'PM10', 'year': 2015, 'month': 'May'},
 {'date': '2015-05-01', 'city': 'Ahmedabad', 'no_stations': 0, 'aqi_category': 'Moderate', 'aqi': 168, 'prominent_pollutant': 'PM2.5', 'year': 2015, 'month': 'May'},
 {'date': '2015-05-01', 'city': 'Hyderabad', 'no_stations': 0, 'aqi_category': 'Moderate', 'aqi': 189, 'prominent_pollutant': 'PM2.5', 'year': 2015, 'month': 'May'}]
In [127]:
# Run SQL queries
## Max AQI in each month in Delhi
results = db.execute("""
SELECT
substr(date, 1, 7) AS month,
MAX(aqi) AS max_aqi
FROM
aqi_bulletins
WHERE
city = 'Delhi'
GROUP BY
month
ORDER BY
month
""")
list(results)
Out[127]:
[('2015-08', 246), ('2015-09', 302), ('2016-03', 352), ('2016-10', 445), ('2016-11', 497), ('2017-04', 383), ('2017-05', 368), ('2017-12', 469), ('2018-06', 447), ('2019-02', 382), ('2019-07', 317), ('2021-01', 460)]
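If you prefer pandas for further inspection or plotting, the same query can be read straight into a DataFrame while the connection is still open; a minimal sketch (the max_aqi_delhi name is just for illustration):

max_aqi_delhi = pd.read_sql_query(
    """
    SELECT substr(date, 1, 7) AS month, MAX(aqi) AS max_aqi
    FROM aqi_bulletins
    WHERE city = 'Delhi'
    GROUP BY month
    ORDER BY month
    """,
    db.conn,  # pandas accepts the underlying sqlite3 connection
)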
In [128]:
db.conn.close()
STEP 3: Opening the SQLite db in Datasette¶
In [40]:
import json
metadata = {
    "title": "Air Quality Index (AQI) Bulletins",
    "description": "This dataset contains the daily air quality bulletins released by CPCB from 2015 to 2024",
    "license": "GODL",
    "license_url": "https://www.data.gov.in/Godl",
    "source": "Central Pollution Control Board (CPCB)",
    "source_url": "https://cpcb.nic.in/AQI_Bulletin.php",
    "about": "UrbanEmissions.info",
    "about_url": "https://github.com/urbanemissionsinfo/AQI_bulletins",
    "databases": {
        "aqi-bulletins": {
            "tables": {
                "aqi_bulletins": {
                    "facets": ["city", "aqi_category", "year", "month"]  # Columns to offer as facets in the table view
                }
            }
        }
    }
}
with open("metadata.json", "w") as f:
json.dump(metadata, f, indent=4)
In [69]:
#!pip install datasette
#!pip install datasette-vega #For charting options
#!datasette "C:/Users/dskcy/UEInfo/AQI_bulletins/data/Processed/aqi_bulletins.db" --port 8001
import subprocess
proc = subprocess.Popen([
    "datasette",
    "C:/Users/dskcy/UEInfo/AQI_bulletins/data/Processed/aqi-bulletins.db",
    "-m", "metadata.json",
    "--config", "suggest_facets:off",        # Don't show suggested facets
    "--config", "default_facet_size:10",     # Show only the top 10 values per facet
    "--config", "facet_time_limit_ms:1000",  # Raise the facet query timeout from the default 200ms to 1000ms
    "--port", "8001",
])
You can now browse the database in Datasette locally at http://localhost:8001/
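Datasette also serves every table as JSON, which is handy for scripted checks against the running server. A minimal sketch, assuming the requests package is installed (city__exact and _size are standard Datasette table filters):

import requests

resp = requests.get(
    "http://localhost:8001/aqi-bulletins/aqi_bulletins.json",
    params={"city__exact": "Delhi", "_size": 5},
)
print(resp.json()["rows"])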
In [70]:
# Kill process
proc.terminate()
proc.wait() # wait for the process to cleanly exit
Out[70]:
1
STEP 4: Deploy¶
For pilot use cases, you can expose the local server through an ngrok tunnel, as sketched below.
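A minimal sketch of the ngrok route, assuming the ngrok CLI is installed and authenticated; it tunnels the local Datasette port to a public URL that ngrok prints to its console:

import subprocess

ngrok_proc = subprocess.Popen(["ngrok", "http", "8001"])  # Forwards a public URL to localhost:8001
# When done:
# ngrok_proc.terminate()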
The following procedure deploys Datasette on Vercel:
- Keep `aqi-bulletins.db` and `metadata.json` in a single directory (just to make it easy).
- Create a new `vercel.json` file in the same folder.
- Create a new `requirements.txt` file in the same folder.
- Create a new `api.py` file in the same folder.
- In your terminal, go to the directory containing the above files and run `pip install uvicorn`, then `uvicorn api:app --reload --host 0.0.0.0 --port 8000`. This starts Datasette locally; you can check that it is running fine at `http://localhost:8000/`.
- Install the Vercel CLI tool.
- Run `vercel login` and complete the login/signup.
- Run `vercel --prod` and complete all the steps. Your Datasette app will be published.
You can find the published AQI-Bulletins Datasette app here: AQI Bulletins Datasette
vercel.json:
{
    "version": 2,
    "builds": [
        {
            "src": "api.py",
            "use": "@vercel/python"
        }
    ],
    "routes": [
        { "src": "/(.*)", "dest": "api.py" }
    ]
}
requirements.txt:
datasette
datasette-vega
<any other Datasette plugins>
api.py:
from datasette.app import Datasette
import os
import json
db_path = os.path.join(os.getcwd(), "aqi-bulletins.db")
metadata_path = os.path.join(os.getcwd(), "metadata.json")
with open(metadata_path) as f:
    metadata = json.load(f)

settings = {
    "suggest_facets": False,
    "default_facet_size": 10,
    "facet_time_limit_ms": 1000,
}
ds = Datasette([db_path], metadata=metadata, settings=settings)
# This is what Vercel uses as the ASGI entrypoint
app = ds.app()