CO2 conc. vs height

Feb. 13, 2022

analyze high-resolution of CO2 hourly concentration provided by NOAA from selected stations in the U.S.

prove a trivial thesis that CO2 concentration are reducing as the intake height increases

produce a high-quality graph like the one below.


Library

In [1]:
import warnings
warnings.filterwarnings('ignore') # make the output cleaner
In [2]:
import re
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
In [3]:
plt.style.use('default')
plt.rcParams["figure.figsize"] = (8,6)

Extract hourly CO2

Download and extract zip package

In [4]:
# create a folder to contain hourly data
try:
    os.makedirs('hourly')
except OSError as e:
    print(e)
    
[Errno 17] File exists: 'hourly'
In [5]:
# if data files are not yet available, run this and the cell below to get them
# !curl https://gml.noaa.gov/aftp/data/trace_gases/co2/in-situ/tower/co2_tower-insitu_1_ccgg_ASCIItext.tar.gz --output hourly/co2.tar.gz
In [6]:
# !tar -xzf ./hourly/co2.tar.gz -C ./hourly/
In [7]:
# if you see .gz and folder of co2_tower, you are the right track
!ls ./hourly/
co2.tar.gz  co2_tower-insitu_1_ccgg_ASCIItext
In [8]:
# pipe list of file to a file to read through each file
!ls ./hourly/co2_tower-insitu_1_ccgg_ASCIItext/ > textfile.txt
In [9]:
with open('textfile.txt') as f:
    lines = f.readlines()
lines
Out[9]:
['co2_amt_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_bao_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_crv_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_lef_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_mbo_surface-insitu_1_ccgg_HourlyData.txt\n',
 'co2_sct_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_snp_surface-insitu_1_ccgg_HourlyData.txt\n',
 'co2_wbi_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_wgc_tower-insitu_1_ccgg_HourlyData.txt\n',
 'co2_wkt_tower-insitu_1_ccgg_HourlyData.txt\n',
 'README_tower_insitu_co2.html\n']

Metadata

In [10]:
def extract_global_attributes(fpath): 
    '''capture the header of global attributes
    return a list of lines
    '''
    
    with open(fpath) as f:
        head = [next(f) for x in range(200)]
        head = [line for line in head if line.startswith('#')]

        # extract GLOBAL VARIABLE
        for i, line in enumerate(head):
            if 'DESCRIPTION' in line:
                start_i = i
            if 'VARIABLE ATTRIBUTES' in line:
                end_i = i
        head = head[start_i: end_i]
    return head
In [11]:
# some attr has an extra space
attrs  = ['site_code', 'site_country ', 'site_name', 
          'site_latitude', 'site_longitude', 'site_elevation ', 
          'site_utc2lst ', 'dataset_description', 'dataset_citation'] 
In [12]:
def extract_metadata(fpath):
    head = extract_global_attributes(fpath)
    meta = dict()
    
    for attr in attrs:
        for line in head:
            if attr in line:
                meta[attr] = line.split(':')[-1].strip()
    return meta
In [13]:
# return all metadata in the stations
site_meta = list()
for file in lines:
    if file.startswith('co2'):
        fpath = f'./hourly/co2_tower-insitu_1_ccgg_ASCIItext/{file.strip()}'
        print(fpath)
        site_meta.append(extract_metadata(fpath))
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_amt_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_bao_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_crv_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_lef_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_mbo_surface-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_sct_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_snp_surface-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_wbi_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_wgc_tower-insitu_1_ccgg_HourlyData.txt
./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_wkt_tower-insitu_1_ccgg_HourlyData.txt
In [14]:
len(site_meta)
Out[14]:
10
In [15]:
site_meta[0]
Out[15]:
{'site_code': 'AMT',
 'site_country ': 'United States',
 'site_name': 'Argyle, Maine',
 'site_latitude': '45.0345',
 'site_longitude': '-68.6821',
 'site_elevation ': '53.0',
 'site_utc2lst ': '-5.0',
 'dataset_description': 'Atmospheric Carbon Dioxide Dry Air Mole Fractions from quasi-continuous measurements at Argyle, Maine.',
 'dataset_citation': '//dx.doi.org/10.7289/V57W69F2'}

Observatory station map

In [16]:
# visualize location of stations
import folium
from folium.features import DivIcon
In [17]:
def add_marker(m, station):
#     print(station)
    try:
        if station is not None:
            location=[station['site_latitude'],station['site_longitude']]
            tooltip = f"{station['site_name']}
Elevation, m
{station['site_elevation ']}" folium.Circle(location=location, fill=True, radius = 50_000, color='maroon', tooltip=tooltip, ).add_to(m) folium.map.Marker( location=location, icon=DivIcon( icon_size=(150,36), icon_anchor=(0,0), html='%s' % station['site_code'], ) ).add_to(m) return m except Exception as e: print(e) print(station) return None
In [18]:
m = folium.Map(location=[45.5236, -122.6750])
for station in site_meta:
    if station is not None: 
        location=[float(station['site_latitude']), float(station['site_longitude'])]
        add_marker(m, station)
In [19]:
# locate the boundary of the map
south = min([station['site_latitude'] for station in site_meta])
west = min([station['site_longitude'] for station in site_meta])
north = max([station['site_latitude'] for station in site_meta])
east = max([station['site_longitude'] for station in site_meta])
south, west, north, east
Out[19]:
('31.3149', '-105.004', '64.9863', '-97.3269')
In [20]:
m.fit_bounds([[south, west], [north, east]])
m
Out[20]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [21]:
# we could save the map into html and display on anothe webpage
m.save('co2_stations.html')
In [22]:
%%html
<iframe
  src="co2_stations.html"
  width="800"
  height="600"
  title="chart name"
  style="border:none">
iframe>"
"
In [23]:
# or save file as a static image
import io
import base64
from IPython.core.display import HTML
img = io.BytesIO()
In [24]:
#using selenium to render a webpage require geckodriver
# download from https://github.com/mozilla/geckodriver/releases
# extract the bin package to your local virtual environment or any path below
# !echo $PATH
In [25]:
img = m._to_png(3)
In [26]:
encoded_string = base64.b64encode(img).decode("utf-8").replace("\n", "")
img = HTML(f'{encoded_string}>')
In [27]:
img
Out[27]:

CO 2 at different height

  • first let see the ditribution of CO2 with height. We take granted that CO2 are heavier than the air, therefore, the CO2 are more concentrated to the ground level.
  • acquiring data to prove that are not easier because it requires a reliable set of CO2 reading at different height. Fortunately, this package includes some stations that measure CO2 with different height (up to 400m compared to the ground level)
  • we will make a toy analysis to prove that CO2 are more concentrated at lower level
In [28]:
fpath = f'./hourly/co2_tower-insitu_1_ccgg_ASCIItext/{lines[0].strip()}'
fpath
Out[28]:
'./hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_amt_tower-insitu_1_ccgg_HourlyData.txt'
In [29]:
# convert to datetime object
def make_dt(row):
    try:
        year = row['year']
        month =  row['month']
        day =  row['day']
        hour = row['hour']
        minute = row['minute']
        second = row['second']
        dt = datetime.datetime(int(year), int(month), int(day),
                               int(hour), int(minute), int(second))
        return dt
    except Exception as e:   
        pass
        print(f'Exception converting datetime: {e}')
        print(row)
    return None
In [30]:
def clean_co2_text(fpath):
    with open(fpath) as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if line.startswith('site_code'):
#             print(line)
            print(i)
            break
    cols = lines[i].split(r'\s+')[-1].strip().split(' ')
    
    first_line = re.split(r'\s+', lines[i+1].strip())
    if len(first_line) != len(cols):
        print(lines[i+1])
        print(first_line)
        return None
    
    data = lines[i+1:]
    data = [re.split(r'\s+', line.strip()) for line in data]
    df = pd.DataFrame(data=data, columns=cols)
    df.dropna(inplace=True)
    df['time'] = df.apply(make_dt, axis=1)
    df.value = pd.to_numeric(df.value)
    df.intake_height = pd.to_numeric(df.intake_height)
    df = df[['time', 'value', 'intake_height']]
    df.set_index('time', inplace=True)
    return df
In [31]:
# select only text file. There is a readme file in HTML
# this is going to take a while, upto 2 mintues

lines = [line for line in lines if '.txt' in line]
for line in lines:
    fpath = f'./hourly/co2_tower-insitu_1_ccgg_ASCIItext/{line.strip()}'
    df = clean_co2_text(fpath)
    height = df.intake_height.unique()
    print(line, height)
153
co2_amt_tower-insitu_1_ccgg_HourlyData.txt
 [107.  12.  30.]
157
co2_bao_tower-insitu_1_ccgg_HourlyData.txt
 [300.  22. 100.]
198
co2_crv_tower-insitu_1_ccgg_HourlyData.txt
 [17.1  4.9 31.7]
180
co2_lef_tower-insitu_1_ccgg_HourlyData.txt
 [244. 396.  30.  11.  76. 122.]
152
co2_mbo_surface-insitu_1_ccgg_HourlyData.txt
 [11.3]
160
co2_sct_tower-insitu_1_ccgg_HourlyData.txt
 [305.  31.  61.]
159
co2_snp_surface-insitu_1_ccgg_HourlyData.txt
 [17.  5. 10.]
154
co2_wbi_tower-insitu_1_ccgg_HourlyData.txt
 [ 99.  31. 379.]
196
co2_wgc_tower-insitu_1_ccgg_HourlyData.txt
 [ 91.  30. 483.]
148
co2_wkt_tower-insitu_1_ccgg_HourlyData.txt
 [ 30.  61. 122. 457.   9. 244.]
In [32]:
# we will look more into WKT (TX) station which shows 6 height intakes
# this is a re-read, but we can use the df object from previous run since it is the latest df
fpath = './hourly/co2_tower-insitu_1_ccgg_ASCIItext/co2_wkt_tower-insitu_1_ccgg_HourlyData.txt'
df = clean_co2_text(fpath)
148
In [33]:
site_meta[-1]
Out[33]:
{'site_code': 'WKT',
 'site_country ': 'United States',
 'site_name': 'Moody, Texas',
 'site_latitude': '31.3149',
 'site_longitude': '-97.3269',
 'site_elevation ': '251.0',
 'site_utc2lst ': '-6.0',
 'dataset_description': 'Atmospheric Carbon Dioxide Dry Air Mole Fractions from quasi-continuous measurements at Moody, Texas.',
 'dataset_citation': '//dx.doi.org/10.7289/V57W69F2'}
In [34]:
# convert timestamp from GMT to local 
# should be automatic from site_meta
delta = datetime.timedelta(hours=-6)
In [35]:
df.index += delta
In [36]:
# was the intake fixed since the beggning or they was adjusted 
heights = list()
for year in df.index.year.unique():
    dft = df[df.index.year == year]
    heights.append({year:dft.intake_height.unique()})
In [37]:
# the measurement height changed by year
heights
Out[37]:
[{2001: array([ 30.,  61., 122., 457.,   9., 244.])},
 {2002: array([  9.,  30.,  61., 122., 244., 457.])},
 {2003: array([244., 457.,  30.,   9.,  61., 122.])},
 {2004: array([244., 457., 122.,  61.,  30.,   9.])},
 {2005: array([ 61.,   9.,  30., 244., 457., 122.])},
 {2006: array([457., 122.,  30.])},
 {2007: array([457., 122.,  30.])},
 {2008: array([457., 122.,  30.])},
 {2009: array([457., 122.,  30.])},
 {2010: array([457., 122.,  30.])},
 {2011: array([457., 122.,  30.])},
 {2012: array([457., 122.,  30.])},
 {2013: array([122.,  30., 457.])},
 {2014: array([457., 122.,  30.])},
 {2015: array([457., 122.,  30.])},
 {2016: array([457., 122.,  30.])},
 {2017: array([457., 122.,  30.])},
 {2018: array([457., 122.,  30.])},
 {2019: array([122.,  30.])},
 {2020: array([122.,  30.])}]
In [38]:
def average_co2(df, height):
    '''return a moving average of 24 points (1 day)'''
    
    _df = df.query('intake_height==@height')['value']
    _df = _df.rolling('1D').mean()
#     _df = _df.rolling(window=24, center=True).mean()
    return _df
In [39]:
# select one month of data
dft = df[(df.index.year == 2002) & (df.index.month==2)]
fig, ax = plt.subplots()
for height in sorted(dft.intake_height.unique()):
    ax.plot(average_co2(dft, height), label=height, alpha=0.9, lw=0.5, ls='--', marker='.', markersize=1)


ax.legend()
ax.set_title('CO2 concentration measured at Moody, TX at different heights')
fig.autofmt_xdate();
  • CO 2 is heavier than the dry air. Molecular weight of dry air is 29g/mole, and CO 2 is 44, so we should see a higher concentration of CO 2 near the ground and less as the height increases.
  • this graph shows a nice trend, but could we process it further to see the trend between concentration vs. height
  • the dataframe is a long table format, we need to pivot them to a wide table format. Each height occupied a seperate column
In [40]:
# pivotting table
dft2 = dft.pivot(columns='intake_height', values='value')
dft2.head()
Out[40]:
intake_height 9.0 30.0 61.0 122.0 244.0 457.0
time
2002-02-01 00:00:00 381.7 380.9 380.8 380.7 380.4 379.2
2002-02-01 01:00:00 381.7 380.9 380.7 380.5 380.2 378.8
2002-02-01 02:00:00 381.5 380.7 380.6 380.4 380.1 378.1
2002-02-01 03:00:00 381.9 381.1 380.9 380.8 380.6 378.2
2002-02-01 04:00:00 382.4 381.3 381.0 380.7 380.5 378.8
In [41]:
# let get relative ratio of CO2 from other height to the one nearest to the ground
cols = dft2.columns[1:]
cols
Out[41]:
Float64Index([30.0, 61.0, 122.0, 244.0, 457.0], dtype='float64', name='intake_height')
In [42]:
dfa = pd.DataFrame()
for col in cols:
    dfa[col] = dft2[col]/dft2[9.0]
In [43]:
dfa.head()
Out[43]:
30.0 61.0 122.0 244.0 457.0
time
2002-02-01 00:00:00 0.997904 0.997642 0.997380 0.996594 0.993450
2002-02-01 01:00:00 0.997904 0.997380 0.996856 0.996070 0.992402
2002-02-01 02:00:00 0.997903 0.997641 0.997117 0.996330 0.991088
2002-02-01 03:00:00 0.997905 0.997382 0.997120 0.996596 0.990312
2002-02-01 04:00:00 0.997123 0.996339 0.995554 0.995031 0.990586
In [44]:
dfa.describe()
Out[44]:
30.0 61.0 122.0 244.0 457.0
count 472.000000 472.000000 472.000000 472.000000 472.000000
mean 0.996578 0.994955 0.992844 0.990735 0.988063
std 0.003242 0.005236 0.007396 0.009834 0.010725
min 0.966516 0.963998 0.953540 0.949532 0.940621
25% 0.995826 0.992667 0.989766 0.985815 0.980438
50% 0.997655 0.996929 0.995582 0.994153 0.990320
75% 0.998425 0.998432 0.998406 0.998425 0.997648
max 1.000797 1.002785 1.005980 1.019608 1.010960
In [45]:
fig, ax = plt.subplots()
ax.set_title('Ratio of concentration of different height compared lowest level (9m)')
ax.boxplot(dfa, meanline=True, showmeans=True, labels=cols);
In [46]:
# we can make the graph more descriptive
means = dfa.describe().loc['mean'].values
In [47]:
# we can value next each box
for i, mean in enumerate(means):
    ax.text(
        i + 1.25,
        mean,
        str(round(mean, 4)).zfill(5),
        fontsize=11,
        va="center",
        bbox = dict(
            facecolor="white",
            edgecolor="grey",
            boxstyle="round",
            pad=0.15
        ),
        zorder=10 # to make sure the line is on top
    )
In [48]:
fig
Out[48]:
In [49]:
# that is expected, similar to a logarithmic relationship 
fig, ax = plt.subplots()
ax.plot(cols, means, ls='--', marker='o')
ax.set_title('Ratio of concentration of different height compared lowest level (9m)')
ax.set_ylabel('ratio to 9m level')
ax.set_xlabel('height, m')
Out[49]:
Text(0.5, 0, 'height, m')
In [50]:
# let see if we could extract more ratios from other months
def plot_subplot(year, month, ax, df=df):   
    dft = df[(df.index.year == year) & (df.index.month==month)]
    for height in sorted(dft.intake_height.unique()):
        ax.plot(average_co2(dft, height), label=height, alpha=0.9, lw=0.5, ls='--', marker='.', markersize=1)
    ax.spines["left"].set_color("none")
    ax.spines["bottom"].set_color("none")
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.set_xticklabels([])
    ax.set_xlabel(month)
    ax.set_xticks([])
    return None
In [51]:
dft = df[(df.index.year == 2002) & (df.index.month==11)]
_dft = dft.pivot(columns='intake_height', values='value')
In [52]:
months = np.array([i for i in range(1,13)]).reshape(4,3)
months
Out[52]:
array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])
In [53]:
for year in [2002, 2003, 2004, 2005]:  
    fig, axes = plt.subplots(nrows=4, ncols=3, sharey=True)
    fig.suptitle(year)
    for month in range(1, 13):
        row, col = np.argwhere(months==month)[0]
        plot_subplot(year, month, axes[row, col])