Experiment with the requests
and feedparser
packages to extract updates on CO2 concentration from the NOAA website
An exercise in web scraping: manipulating strings, converting them to Python objects such as datetime and float, and packing the data into a dataframe. Finally, the data is plotted using matplotlib.pyplot.
# plt_2_img(fig=fig)
requests
is a robust and flexible package for fetching a web page. Install it with:
pip install requests
# address of the RSS feed; handed to requests.get() and feedparser.parse() further down
rss_url = 'https://gml.noaa.gov/webdata/ccgg/trends/rss.xml'
The feed is in xml format, which has attribute:value structures.
The hierarchy is rss > channel > then a list of item elements.
We fetch the feed with the requests package and extract the data inside the items.
import requests

# Request the webpage. requests applies no timeout by default, so a server
# that never answers would block this call forever; give up after 10 seconds.
resp = requests.get(rss_url, timeout=10)
# check status code
resp.status_code
# the body comes back as one long string
print(len(resp.text))
resp.text[:200]
To parse the string, we will use the xml library.
from pprint import pprint as pp
import xml.etree.ElementTree as ET

# peek at the start of the raw document
pp(resp.text[:600])

# build an element tree from the response text
root = ET.fromstring(resp.text)
for child in root:
    print(child.tag, child.attrib)
type(root)
list(root)
print(list(root[0]))

# the item elements sit under channel
items = root.findall('./channel/item')
len(items)

# the innermost elements: the children of the first item
first_item = list(items[0])
print(first_item)  # print statement to make the output clear on HTML
# show tag and text of the children at positions 0, 3 and 4
for position in (0, 3, 4):
    print(first_item[position].tag, first_item[position].text)
Install feedparser with pip,
otherwise you will see an import error
!pip install feedparser
import feedparser
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-52681e28dc26> in <module>
----> 1 import feedparser
ModuleNotFoundError: No module named 'feedparser'
import feedparser
# parse the feed straight from its URL
feed = feedparser.parse(rss_url)
# unlike the long string returned by requests, the feed is a dictionary-like object
feed.keys()
feed['bozo']
feed['updated']
feed['status']
# top element
feed['headers']
# equivalent to the items extracted with ElementTree
len(feed['entries'])
# Walk over the fields of the first entry, print its title and summary,
# and keep the summary text in `summary`.
summary = ''
for key, value in feed['entries'][0].items():
    # uncomment to dump every field: print(key, '-->', value)
    if key not in ('title', 'summary'):
        continue
    print(value)
    if key == 'summary':
        summary = value
# look again at the title, the line that contains the date
title = feed['entries'][0].title
title
# keep only the text after the last 'for' in the title and trim the whitespace
date = title.split('for')[-1].strip()
date
# then use datetime to convert the string into a date object
# (the format expects month name, day, comma, four-digit year)
import datetime
datetime.datetime.strptime(date, '%B %d, %Y').date()
# or use dateutil to parse the date without spelling out the format
from dateutil import parser
parser.parse(date).date()
# keep the list of entries and compare the first, second and last summaries
entries = feed['entries']
entries[0]['summary']
entries[1]['summary']
entries[-1]['summary']
# first deal with elements 0-7 (8 in total)
summary = entries[0].summary.splitlines()
summary
# It turns out the date does not have to come from the title:
# it is also available in the second line of the summary.
# Split that line down to the piece that holds the date.
summary[1].split(':')
date = summary[1].split(':')[0]
date
type(date)
# dateutil reports an unparseable string with ParserError (a ValueError
# subclass) and an out-of-range date with OverflowError; catch only those
# so that unrelated mistakes are not hidden.
try:
    parser.parse(date)
except (ValueError, OverflowError) as e:
    print('Exception raise: ', e)
# the parser cannot handle this form of string; help it by passing only the date part
parser.parse(date.split('on')[-1]).date()
# or even better, use the fuzzy=True argument
parser.parse(date, fuzzy=True).date()
# the text after the last ':' holds the concentration
conc = summary[1].split(':')[-1]
conc
# keep what precedes 'ppm', without the surrounding whitespace
conc.split('ppm')[0].strip()
summary
# repeat the steps on the second entry, this time keeping every line after the first
summary = entries[1].summary
lines = summary.splitlines()[1:]
lines
# split each line on ':'
lines = [line.strip().split(':') for line in lines]
lines
lines[0][0]
# the text before the first ':' of the first kept line is parsed for the date
date = parser.parse(lines[0][0], fuzzy=True).date()
date
# take the text after the last ':' of each line and cut it at 'ppm'
concs = [line[-1].split('ppm') for line in lines]
concs
# keep the part before 'ppm', stripped of whitespace
concs = [conc[0].strip() for conc in concs]
# let's design a function to extract the data
def extract_co2_conc(entry):
    '''Pull the date and the concentration readings out of one feed entry.

    The text in entry.summary is read line by line. The first line is
    ignored; every remaining line is split on ':'.

    Parameters
    ----------
    entry : object with a ``summary`` string attribute

    Returns
    -------
    dict
        A single-item dict. The key is the date parsed from the text before
        the first ':' of the first kept line. The value is a list with, for
        each kept line, the text between the last ':' and 'ppm', stripped of
        whitespace and left as a string.
    '''
    # drop the first line, then split every remaining line on ':'
    fields = [row.strip().split(':') for row in entry.summary.splitlines()[1:]]
    # fuzzy parsing lets dateutil find the date among the surrounding words
    day = parser.parse(fields[0][0], fuzzy=True).date()
    readings = []
    for parts in fields:
        # the last ':'-separated piece holds the reading; cut it at 'ppm'
        value, *_ = parts[-1].split('ppm')
        readings.append(value.strip())
    return {day: readings}
# try the helper on the first entry
extract_co2_conc(entries[0])
# Merge the {date: readings} dict of every entry except the last one
# (presumably the last summary has a different layout; confirm).
co2_data = {}
for entry in entries[:-1]:
    co2_data.update(extract_co2_conc(entry))
co2_data
# create a dataframe: one column per date, values converted to float
import pandas as pd
df = pd.DataFrame(co2_data, dtype=float)
df
# transpose the table so that each date becomes a row
df = df.T
df
this_year = df.index[0].year
this_year
# column labels: this year, 1 year ago, and 10 years ago
cols = [this_year - years_back for years_back in (0, 1, 10)]
df.columns = cols
df.head()
import matplotlib.pyplot as plt
# background colour reused for the figure, the axes, the marker faces and the annotation boxes
bg_color='#F5F4EF'
list(feed)
feed.headers
feed.updated
# the feed's subtitle is used as the figure title
title = feed.feed['subtitle']
# the feed's update time, reformatted as month name, day, year
update = parser.parse(feed.feed['updated']).date().strftime('%B %d, %Y')
plt.rcParams['font.family'] = 'monospace'
plt.rcParams['font.size'] = 12
import matplotlib as mpl
dict(zip(df.iloc[1].index, df.iloc[1].values))
# label and values of the first row; used to place the annotations on the plot
x = df.iloc[0].name
Y = list(df.iloc[0].values)
Y
# rounded, semi-transparent box drawn behind each annotation
bbox = dict(boxstyle="round,pad=0.3", fc=bg_color, ec='k', alpha=0.3)
# one figure with a single axes, both on the shared background colour
fig, ax = plt.subplots(figsize=(10, 6), facecolor=bg_color)
colors = ['firebrick', 'maroon', 'black']
# line and marker settings common to every series
marker_style = dict(
    marker='o',
    linewidth=0.5,
    markersize=10,
    markerfacecolor=bg_color,
    markeredgewidth=2,
)
for i, (col, shade) in enumerate(zip(cols, colors)):
    # one line per column of the dataframe
    ax.plot(df[col], color=shade, label=col, **marker_style)
    # write the first row's value next to its point, shifted up by 1
    ax.annotate(
        Y[i],
        xy=(x, Y[i] + 1),
        xycoords='data',
        va='center',
        ha='right',
        color=shade,
        bbox=bbox,
    )
# ax.set_ylim(350, 450)
ax.set_facecolor(bg_color)
fig.suptitle(title)
ax.set_title(f'updated: {update}', fontsize=12)
ax.tick_params(axis='both', direction='in', length=8)
# show dates on the x axis as month name and day
ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%B %d'))
ax.set_ylabel('parts per million, ppm')
ax.legend()
fig.tight_layout()
import io
import base64
from IPython.core.display import HTML
# save the figure as a PNG into an in-memory buffer instead of a file
img = io.BytesIO()
fig.savefig(img, format='png', bbox_inches="tight")
# the function below wraps the same steps
def plt_2_img(fig=None):
    '''Render a matplotlib figure as an inline PNG for Jupyter.

    The figure is saved as a PNG into an in-memory buffer, base64-encoded
    and embedded in an <img> tag as a data URI, so no image file is written.

    Parameters
    ----------
    fig : object with a ``savefig`` method, optional
        The figure to render. When omitted, the current pyplot figure is used.

    Returns
    -------
    HTML
        IPython HTML object wrapping the <img> tag.
    '''
    if fig is None:
        # honour the default: fall back to the figure pyplot considers current
        import matplotlib.pyplot as plt
        fig = plt.gcf()
    buffer = io.BytesIO()
    fig.savefig(buffer, format='png', bbox_inches="tight")
    # b64encode returns a single line, so there are no newlines to strip
    encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")
    data_uri = f'data:image/png;base64,{encoded_string}'
    # wrap the data URI in an <img> tag so the notebook can show the picture
    return HTML(f'<img src="{data_uri}">')
# build the HTML object for the figure with the helper
plt_2_img(fig=fig)