#!/usr/bin/env python # coding: utf-8 # # Exploring ABC Radio National metadata # # This notebook shows a few ways you can start to explore the ABC Radio National metadata harvested [using this notebook](harvest-abcrn.ipynb). # # For an earlier experiment playing with this data, see [In a word...: Currents in Australian affairs, 2003–2013](https://inaword.herokuapp.com/). # In[ ]: import re from pathlib import Path import altair as alt import nltk import pandas as pd from nltk.corpus import stopwords from wordcloud import WordCloud nltk.download("stopwords") # Load the harvested data. # In[2]: # Download the most recently harvested data file and convert to a dataframe df = pd.read_csv("https://github.com/GLAM-Workbench/trove-abcrn-data/raw/main/abcrn-metadata.csv") # How many records are there? # In[3]: df.shape[0] # ## Programs # # How many programs are there records for? # In[4]: df["isPartOf"].nunique() # Which programs have the most records? # In[5]: df["isPartOf"].value_counts()[:25] # ## Number of records by year # # To look at the number of records by year, we need to make sure the `date` field is being recognised as a `datetime`. Then we can extract the year into a new column. # In[6]: #df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce") df["year"] = df["date"].str.extract(r"^(\d{4})") # Find the number of times each year appears. # In[7]: year_counts = df["year"].value_counts().to_frame().reset_index() year_counts.columns = ["year", "count"] # Chart the results. # In[8]: alt.Chart(year_counts).mark_bar(size=15).encode( x="year:T", y="count:Q", tooltip=[alt.Tooltip("year:T", format="%Y"), alt.Tooltip("count:Q", format=",")] ).properties(width=600) # The early records look a bit suspect, and I should probably check them manually. I'm also wondering why there's been such a large decline in the number of records added since 2017. # ## People # # The `contributor` field includes the names of hosts, reporters, and guests. It's stored as a pipe-delimited string, so we have to split the string, then explode the resulting list to create one row per name. # In[9]: people = df["contributor"].str.split("|").explode().dropna() # Then we can calculate how often people appear in the records. # In[10]: people.value_counts()[:25] # In[11]: wc_people = WordCloud(width=1000, height=500).fit_words(people.value_counts().to_dict()) wc_people.to_image() # ## Titles # # There are three text fields that could yield some interesting analysis. The `title` field is obvious enough, though some regular segments do have duplicate titles. The `abstract` field is a brief summary of the segment or program. # # Let's try aggregating the titles for a program. # ### RN Breakfast in 2020 # In[12]: breakfast_titles = list( df.loc[ ( df["isPartOf"].isin( ["ABC Radio National. RN Breakfast", "ABC Radio. RN Breakfast"] ) ) & (df["year"] == "2020") ] .drop_duplicates(subset=["title"], keep=False)["title"] .unique() ) # In[13]: wordcloud = WordCloud( width=1000, height=500, stopwords=stopwords.words("english") + [ "Australia", "Australian", "Australians", "New", "News", "Matt", "Bevan", "World", ], collocations=False, ).generate(" ".join(breakfast_titles)) wordcloud.to_image() # ## RN Drive in 2020 # In[14]: drive_titles = list( df.loc[ (df["isPartOf"].isin(["ABC Radio National. RN Drive", "ABC Radio. RN Drive"])) & (df["year"] == "2020") ] .drop_duplicates(subset=["title"], keep=False)["title"] .unique() ) # In[15]: wordcloud = WordCloud( width=1000, height=500, stopwords=stopwords.words("english") + [ "Australia", "Australian", "Australians", "New", "News", "Matt", "Bevan", "World", ], collocations=False, ).generate(" ".join(drive_titles)) wordcloud.to_image() # ## Title words over time # # We can also look to see how often particular words or phrases appeared in the titles of RN segments or programs. This could of course be filtered by program, as above. # # ### Bushfires # In[16]: # Drop records without a title df_titles = df.dropna(subset=["title"]) # Find titles containing 'bushfire' bushfires = df_titles.loc[ df["title"].dropna().str.contains(r"bushfire", regex=True, flags=re.IGNORECASE) ] # In[17]: # Chart the results alt.Chart(bushfires).mark_line().encode( x="year(date):T", y="count()", tooltip=["year(date):T", "count():Q"] ) # ### Refugees # In[18]: refugees = df_titles.loc[ df["title"].dropna().str.contains(r"refugee", regex=True, flags=re.IGNORECASE) ] alt.Chart(refugees).mark_line().encode( x="year(date):T", y="count():Q", tooltip=["year(date):T", "count():Q"] ) # ### Climate change / global warming # In[19]: climate = df_titles.loc[ df["title"] .dropna() .str.contains(r"(?:climate change|global warming)", regex=True, flags=re.IGNORECASE) ] alt.Chart(climate).mark_line().encode( x="year(date):T", y="count():Q", tooltip=["year(date):T", "count():Q"] ) # ### Trump # In[20]: trump = df_titles.loc[ df["title"].dropna().str.contains(r"\btrump\b", regex=True, flags=re.IGNORECASE) ] alt.Chart(trump).mark_line().encode( x="year(date):T", y="count():Q", tooltip=["year(date):T", "count():Q"] ) # ---- # # Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io/)