#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import wikipedia
import pandas as pd
from bs4 import BeautifulSoup
import requests
import cv2
import sklearn
from skimage import io
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
import os
import json
# NOTE: `io` above is skimage's module and shadows the stdlib one, so the
# stdlib class is imported by name instead of via `io.StringIO`.
from io import StringIO

from sklearn.cluster import KMeans

from colour_segmentation.base.segmentation_algorithm import SegmentationAlgorithm
from colour_segmentation.segmentator import Segmentator

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

get_ipython().run_line_magic('config', 'Completer.use_jedi = False')


# # Look at data for Renoir's paintings - especially the colours
#
# Uses this [Wikipedia list](https://en.wikipedia.org/wiki/List_of_paintings_by_Pierre-Auguste_Renoir). We'll parse the tables for each decade and get the data. We'll download all the original files of the paintings.
#
# We'll also extract the colours from each painting to cluster them.

# In[527]:


# Read the page and its HTML
page = wikipedia.page('List of paintings by Pierre-Auguste Renoir')
html = page.html()

# Read the list of Renoir's paintings by decade one by one
# note there's two more tables at the bottom, they're not interesting to this
# (the HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases)
page_tables = pd.read_html(StringIO(html))[0:6]

# separate DFs by decade
df_1860s, df_1870s, df_1880s, df_1890s, df_1900s, df_1910s = page_tables


# In[528]:


# manually removing one from 1870 as it doesn't have an image file
# (.copy() so that adding a column below writes to an independent frame
# rather than to a filtered view of the original table)
df_1870s = df_1870s[df_1870s.Title != 'Conversation with the Gardener'].copy()


# In[529]:


# the above doesn't give the URLs of images attached to painting titles
# (the "Picture" field is NaN), those are in the images field of the page but they're not in order
# page.images
# so we shall use BeautifulSoup directly to parse those and attach to the dfs

soup = BeautifulSoup(html, 'lxml')  # Parse the HTML as a string

# these are the decade tables
table_1860, table_1870, table_1880, table_1890, table_1900, table_1910 = soup.find_all('table')[0:6]


def _original_filename(thumb_src):
    """Return the original-file path encoded in a thumbnail ``src`` URL.

    The part after ``thumb/`` is ``<hash dirs>/<file name>/<sized thumbnail>``;
    dropping the last component leaves the path of the original file. Unlike
    splitting on ``'.jpg'``, this does not depend on the file extension.

    Raises IndexError if ``thumb_src`` contains no ``thumb/`` segment.
    """
    return thumb_src.split('thumb/')[1].rsplit('/', 1)[0]


# and here we get the HREFs to the imgs for each
for decade_table, decade_df in (
        (table_1860, df_1860s),
        (table_1870, df_1870s),
        (table_1880, df_1880s),
        (table_1890, df_1890s),
        (table_1900, df_1900s),
        (table_1910, df_1910s)):
    images = decade_table.find_all('a', class_='image')
    if len(images) != len(decade_df):
        # the filenames are matched to the rows purely by position, so a
        # painting without a picture would silently shift every later row
        raise ValueError(
            'found {n_img} images for {n_rows} table rows - remove the rows without a picture first'.format(
                n_img=len(images), n_rows=len(decade_df)))
    decade_df['original_filename'] = [_original_filename(image.find('img')['src']) for image in images]


# In[530]:


#images[-1]
#images[-1].find('img')['src'].split('thumb/')[1].split('.jpg')[0]


# In[535]:


df_1860s


# # Now download all imgs
#
# Download the original files, from wikimedia. Filenames need to be prefixed. The lookup for the filename to use and how to build the URL to it has been investigated and tested.
#
# If there's any that doesn't get downloaded I try to fix it manually (originally this hit the files that are not a .jpg; the filename extraction above no longer assumes that extension). Or leave it be if I can't.
# In[76]:


base_url = 'https://upload.wikimedia.org/wikipedia/commons/'

# # need to spoof it as a browser user agent otherwise wikimedia
# r = requests.get('https://upload.wikimedia.org/wikipedia/commons/6/65/Pierre-Auguste_Renoir_-_Portrait_de_la_m%C3%A8re_de_Renoir.jpg',
#                  headers={'User-agent': 'Mozilla/5.0'})

# do one at a time - it's relatively slow
for _, row in df_1910s.iterrows():
    img_url = base_url + row['original_filename']
    print(img_url)

    try:
        # the timeout keeps one stalled connection from hanging the whole run
        r = requests.get(img_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=60)
    except requests.RequestException as exc:
        print('download failed ({exc}): {url}'.format(exc=exc, url=img_url))
        continue

    if not r.ok:
        # do not save an HTTP error page under a .jpeg name - it would only
        # fail later, when the image gets read
        print('download failed (HTTP {status}): {url}'.format(status=r.status_code, url=img_url))
        continue

    with open('renoir_1910/' + row['Title'] + '.jpeg', 'wb') as f:
        f.write(r.content)


# In[88]:


len(df_1910s)


# In[87]:


get_ipython().system('ls renoir_1910/ | wc -l')


# ## Now investigate the colours!
#
# Using a k-means with a fixed number of clusters for each picture (20 in the cell below; this text originally said 5)

# In[237]:


# The cell body is handed to the %%time magic as source text, so it lives in a string.
get_ipython().run_cell_magic('time', '', """
kmeans = KMeans(n_clusters=20)

# do one by one
dir_ = 'renoir_1890/'
d = {}

for filename in os.listdir(dir_):

    try:

        print(filename)

        img = io.imread(dir_ + filename)

        # resize (to 20% of each dimension) so to reduce data, for speed
        img = cv2.resize(img, (0,0), fx=0.2, fy=0.2)
        img = np.reshape(img, (img.shape[0]*img.shape[1], 3))

        kmeans.fit(img)

        d[filename] = {'centroids': kmeans.cluster_centers_.tolist(),
                       'labels': kmeans.labels_.tolist()}

    # there can be some in wrong format - skip those, but say so, and do not
    # swallow KeyboardInterrupt the way a bare except would
    except Exception as exc:
        print('skipping {}: {}'.format(filename, exc))
""")


# In[238]:


len(d)


# In[239]:


get_ipython().system('ls renoir_1890/ | wc -l')


# In[240]:


# dump to file
with open('renoir_1890_clusters_20.json', 'w') as f:
    json.dump(d, f)


# ## Read all files in, build single dict
#
# For the RGB values, cast to int. For the labels, compute the occupation (as a fraction of the pixels) of each.
# In[256]:


d_decades = {'1860': {}, '1870': {}, '1880': {}, '1890': {}, '1900': {}, '1910': {}}

# NOTE(review): this reads 'renoir_<decade>_clusters.json', while the k-means
# cell in this file writes 'renoir_1890_clusters_20.json' - confirm which set
# of files is meant to be loaded here.
for decade in d_decades:
    print(decade)
    with open('renoir_{decade}_clusters.json'.format(decade=decade), 'r') as f:
        d = json.load(f)

    for title, clusters in d.items():
        centroids = clusters['centroids']
        labels = clusters['labels']
        # pixels per cluster in a single pass, instead of one list.count() per cluster
        counts = np.bincount(np.asarray(labels, dtype=int), minlength=len(centroids))
        d_decades[decade][title] = [
            {'colour': [int(item) for item in centroid],
             'occupation': round(int(counts[i]) / len(labels), 2)}
            for i, centroid in enumerate(centroids)]


# In[257]:


d_decades['1880'].keys()


# In[645]:


d_decades['1880']['Portrait of Charles and George Durand-Ruel(French: Portrait de Charles et George Durand-Ruel).jpeg']


# In[210]:


title = 'Girl Playing Croquet(French/ Fille jouant au croquet).jpeg'


# In[245]:


# spit a jpeg for each decade with the kmeans colours of each title
for decade in d_decades:
    print(decade)
    for title in d_decades[decade]:
        print(title)

        # list of dicts for the title, sorted by occupation down
        l_ = sorted(d_decades[decade][title], key=lambda entry: entry['occupation'], reverse=True)
        if not l_:
            continue  # nothing to draw for this title

        # One swatch per cluster actually present (was a hard-coded 20).
        # squeeze=False keeps `axs` two-dimensional even for a single swatch.
        # The extra plt.figure(figsize=(20,20)) that used to precede this call
        # only created a second, empty figure; its size never applied to the
        # saved image, so dropping it leaves the output unchanged.
        fig, axs = plt.subplots(1, len(l_), squeeze=False)

        for ax, entry in zip(axs[0], l_):
            print(entry)
            _ = ax.imshow(np.full((1, 1, 3), entry['colour']))

        # splitext strips only the extension; split('.')[0] cut the name at
        # the first dot anywhere in the title
        fig.savefig('colours_20_' + decade + '/' + os.path.splitext(title)[0] + '_colours.jpeg')
        plt.close(fig)  # otherwise every figure of the run stays in memory


# In[255]:


d_decades['1880']


# In[236]:


# `data` is only defined by the commented-out cell below, so evaluating it
# here raises NameError when the file is run top to bottom.
# data


# In[251]:


# data = np.random.rand(10, 10) * 20

# # create discrete colormap
# cmap = colors.ListedColormap(['red', 'blue'])
# bounds = [0,10,20]
# norm = colors.BoundaryNorm(bounds, cmap.N)

# fig, ax = plt.subplots()
# ax.imshow(data, cmap=cmap, norm=norm);

# # draw gridlines
# ax.grid(which='major', axis='both', linestyle='-', color='k', linewidth=2)
# ax.set_xticks(np.arange(-.5, 10, 1));
# ax.set_yticks(np.arange(-.5, 10, 1));

# plt.show()


# ## Other method: compare image to some palettes?
#
# ### Renoir's palette
#
# What was Renoir's standard palette is info I got from [here](https://www.jacksonsart.com/blog/2014/08/12/colours-used-sargent-renoir/) and [here](https://webartacademy.com/renoirs-palette) (taken from this one really).
#
# The k-means method is too coarse, gives us colours that are too similar painting by painting. Instead, what about we measure how much each painting is using of his palette.
#
# The RGB of these colours have been chosen with a quick google search.
#
# #### Some interesting videos about these colours:
# * [flake white](https://www.youtube.com/watch?v=vx6BI3Fqr6U)
#
# This won't do - too subjective, there's no green, plus it's his, so it is derived from his own paintings, hence circular reasoning
#
# ### A standard palette?
#
#

# ### Saturated colours +

# ### Renoir's palette colours

# In[223]:


# renoir's palette
d_r_palette = {
    'flake_white': [236, 236, 236],
    'naples_yellow': [250, 218, 94],
    # NOTE(review): (255, 87, 51) is an orange-red, while viridian is normally
    # a blue-green pigment - this value looks wrong; confirm before relying on it.
    'viridian': [255, 87, 51],
    'ivory_black': [35, 31, 32],
    'natural_earth': [128, 96, 67],
    'yellow_ochre': [245, 197, 44],
    'carmine': [247, 74, 70],   # cannot find superfine carmine
    'venetian_red': [200, 8, 21],
    'cobalt_blue': [0, 71, 171],
    'lake_red': [184, 78, 112]
}

l_ = list(d_r_palette.values())   # list of RGB triplets, one per palette colour

# one swatch per palette colour (was a hard-coded 10); the plt.figure() call
# that used to precede this only created an extra, empty figure
fig, axs = plt.subplots(1, len(l_), squeeze=False)

for ax, rgb in zip(axs[0], l_):
    print(rgb)
    _ = ax.imshow(np.full((1, 1, 3), rgb))


# In[249]:


img = io.imread('renoir_1880/The Two Sisters(French: Les deux soeurs).jpeg')
plt.imshow(img)


# In[260]:


# for row in img:
#     print(row)


# ## Trying image segmentation
#
# with proper techniques, using this [lib](https://pypi.org/project/colour-segmentation/)

# In[261]:





# In[518]:


img = cv2.imread("renoir_1910/Portrait of Ambroise Vollard.jpeg")   # the lib is designed to work with OpenCV, so reading BGR
img = cv2.resize(img, (0, 0), fx=0.2, fy=0.2)   # for speed

segmentator = Segmentator(image=img)


# In[519]:


# result_liu = segmentator.segment(method=SegmentationAlgorithm.FUZZY_SET_LIU,
#                                  apply_colour_correction=False,
#                                  remove_achromatic_colours=True)

# NOTE: despite the variable name, this runs the Chamorro-Martinez method
result_amante_fonseca_achr = segmentator.segment(method=SegmentationAlgorithm.FUZZY_SET_CHAMORRO,
                                                 remove_achromatic_colours=False)


# In[520]:


# Number of colour classes queried for the Chamorro-Martinez method. This used
# to be len(colours_cm), but that dict is only defined by a later cell, so the
# cell failed with NameError when the file is run top to bottom. The colours_cm
# dict further down lists 12 colours (keys 0-11).
n_cm_classes = 12

proportions_cm = [result_amante_fonseca_achr.get_colour_proportion(i) for i in range(n_cm_classes)]

result_amante_fonseca_achr.segmented_classes.max()
sum(proportions_cm)
proportions_cm
result_amante_fonseca_achr.get_colour_proportion(1)


# In[521]:


plt.imshow(cv2.cvtColor(result_amante_fonseca_achr.segmented_image, cv2.COLOR_BGR2RGB))


# ### Systematically and with all methods
#
# Each method uses a different set of colours. The colours for each method are taken from the lib docs [here](https://mmunar97.gitbook.io/colour-segmentation/examples/examples-of-fuzzy-logic-based-methods)
#
# The original plan was to use each method with the flag that removes the achromatic colours (white, black, grey); the cell below passes remove_achromatic_colours=False instead, for the reason given in its comment.
# In[329]:


# Amante-Fonseca
colours_af = {
    0: [255, 33, 36],   # red
    1: [170, 121, 66],  # brown
    2: [255, 146, 0],   # orange
    3: [255, 251, 0],   # yellow
    4: [0, 255, 0],     # green
    5: [0, 253, 255],   # cyan
    6: [0, 0, 255],     # blue
    7: [147, 33, 146],  # purple
    8: [255, 64, 255]   # pink
}

# Chamorro - Martinez
colours_cm = {
    0: [255, 33, 36],   # red
    1: [255, 148, 9],   # orange
    2: [255, 255, 13],  # yellow
    3: [186, 255, 15],  # yellow-green
    4: [6, 155, 9],     # green
    5: [12, 255, 116],  # green-cyan
    6: [11, 254, 255],  # cyan
    7: [8, 192, 255],   # cyan-blue
    8: [0, 0, 255],     # blue
    9: [92, 8, 253],    # blue-magenta
    10: [238, 3, 249],  # magenta
    11: [254, 6, 180]   # magenta-red
}

# Liu-Wang
colours_lw = {
    0: [255, 33, 36],    # red
    1: [248, 149, 29],   # orange
    2: [239, 233, 17],   # yellow
    3: [105, 189, 69],   # green
    4: [111, 204, 221],  # cyan
    5: [59, 83, 164],    # blue
    6: [158, 80, 159]    # purple
}

# Shamir
colours_s = {
    0: [255, 33, 36],    # red
    1: [255, 140, 0],    # dark orange
    2: [255, 165, 0],    # light orange
    3: [255, 255, 0],    # yellow
    4: [144, 238, 144],  # light green
    5: [0, 100, 0],      # dark green
    6: [0, 255, 255],    # aqua
    7: [0, 0, 255],      # blue
    8: [128, 0, 128],    # dark purple
    9: [255, 0, 255]     # light purple
}


# In[367]:


# run the same logic as in the example above on all imgs, save occupation to file

# run all segmentor
decades = ['1860', '1870', '1880', '1890', '1900', '1910']

d_af = {decade: {} for decade in decades}
d_cm = {decade: {} for decade in decades}
d_lw = {decade: {} for decade in decades}
d_s = {decade: {} for decade in decades}

# (results dict, palette, algorithm, extra keyword arguments) for each method.
# Every method runs with remove_achromatic_colours=False:
# I tried True but gives not-normalised values (sum of occupations isn't 1)
segmentation_runs = [
    (d_af, colours_af, SegmentationAlgorithm.FUZZY_SET_AMANTE, {}),     # Amante-Fonseca
    (d_cm, colours_cm, SegmentationAlgorithm.FUZZY_SET_CHAMORRO, {}),   # Chamorro-Martinez
    (d_lw, colours_lw, SegmentationAlgorithm.FUZZY_SET_LIU,
     {'apply_colour_correction': False}),                               # Liu-Wang
    (d_s, colours_s, SegmentationAlgorithm.FUZZY_SET_SHAMIR, {}),       # Shamir
]

for decade in decades:
    print(decade)
    for filename in os.listdir('renoir_{decade}'.format(decade=decade)):
        #print(filename)

        img = cv2.imread("renoir_{decade}/{filename}".format(decade=decade, filename=filename))
        if img is None:
            # cv2.imread reports an unreadable file by returning None, not by raising
            print('skipping unreadable image: {decade}/{filename}'.format(decade=decade, filename=filename))
            continue

        try:
            # resize for speed
            img = cv2.resize(img, (0, 0), fx=0.2, fy=0.2)
        except cv2.error as exc:
            print('skipping {decade}/{filename}: {exc}'.format(decade=decade, filename=filename, exc=exc))
            continue

        for results, palette, method, extra_kwargs in segmentation_runs:
            try:
                segmentator = Segmentator(image=img)
                result = segmentator.segment(method=method,
                                             remove_achromatic_colours=False,
                                             **extra_kwargs)
                proportions = {k: result.get_colour_proportion(k) for k in palette}
            except Exception as exc:
                # still best-effort, but the failure is reported and Ctrl-C is
                # no longer swallowed as it was by the bare except
                print('segmentation {method} failed for {decade}/{filename}: {exc}'.format(
                    method=method, decade=decade, filename=filename, exc=exc))
                continue

            # Store an entry only once it is complete: the aggregation cells
            # below index every colour key of every stored title, so an empty
            # or partial entry would raise KeyError there.
            results[decade][filename] = proportions


# In[368]:


sum(d_af['1860']['Chalands sur la Seine (Barges on the Seine).jpeg'].values())


# In[378]:


# d_lw['1880'].keys()
# sum(d_s['1880']['Blonde Bather (1881)(French: La baigneuse blonde).jpeg'].values())


# In[400]:


len(d_af['1910'])


# ## Round up occupations
#
# To 2 decimal digits - this is for ease of eye-balling

# In[539]:


# only the Chamorro-Martinez results are rounded here
for decade in decades:
    for proportions in d_cm[decade].values():
        for k in proportions:
            proportions[k] = round(proportions[k], 2)


# ## Look at aggregated measures from this data

# In[540]:


# eye-ball means of colours by decade for one of the colour methods
for position, decade in enumerate(['1860', '1870']):
    if position == 0:
        print(decade)
    else:
        print('\n', decade)   # blank line between the decades
    for k in colours_af:
        values = [d_af[decade][title][k] for title in d_af[decade]]
        print(k, np.mean(values), np.std(values))


# In[542]:


colours_cm


# In[543]:


# plotting bars - these are coloured with the RGBs of the colour from colour method

d_plot = d_cm.copy()             # choose the colour method here
d_colours = colours_cm.copy()    # and the relative colours dict

fig, axs = plt.subplots(3, 2, figsize=(10, 10))
fig.suptitle('Mean occupation of colour for colour method')

# The colour indices come straight from the palette; every stored title holds
# exactly these keys. Reading them from the first title of a decade, as before,
# failed with StopIteration for a decade without any processed title.
x = list(d_colours)
colours = [tuple(item / 255 for item in d_colours[i]) for i in x]   # this must be a list of tuples, as per pyplot

# axs.flat walks the grid row by row: 1860, 1870 / 1880, 1890 / 1900, 1910
for decade, ax in zip(decades, axs.flat):
    y = [np.mean([d_plot[decade][title][k] for title in d_plot[decade]]) for k in x]
    ax.barh(x, y, color=colours)
    ax.set_title(decade)

plt.show()

#x, y, colours


# In[544]:


# tot of titles processed for colour
sum(len(d_af[decade]) for decade in decades)


# In[740]:


d_cm['1880'].keys()


# In[715]:


len(d_cm['1910'])


# In[741]:


d_cm['1880']["Les grandes baigneuses(The Large Bathers).jpeg"]


# In[742]:


df_1880s.iloc[:]


# ## Other refs
#
# * A nice [page](https://realpython.com/python-opencv-color-spaces/) on image segmentation with Python
# * [How to choose a palette for painting](https://www.artsy.net/article/artsy-editorial-4-colors-excel-painting)
# * [The colour palettes of 10 famous paintings](https://www.haydnsymons.com/blog/the-colour-palettes-used-by-10-famous-paintings/)

# In[ ]: