#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import wikipedia
import pandas as pd
from bs4 import BeautifulSoup
import requests

import cv2
import sklearn
from skimage import io
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors

import os
import json

from sklearn.cluster import KMeans

from colour_segmentation.base.segmentation_algorithm import SegmentationAlgorithm
from colour_segmentation.segmentator import Segmentator

from IPython.core.interactiveshell import InteractiveShell

# Notebook display setting: "all" asks IPython to echo every top-level
# expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Switch off jedi for tab completion in this session.
get_ipython().run_line_magic('config', 'Completer.use_jedi = False')


# # Look at data for Renoir's paintings - especially the colours
# 
# Uses this [Wikipedia list](https://en.wikipedia.org/wiki/List_of_paintings_by_Pierre-Auguste_Renoir). We'll parse the tables for each decade and get the data. We'll download all the original files of the paintings.
# 
# We'll also extract the colours from each painting to cluster them.

# In[527]:


# Read the page and its HTML

# Imported by name only: the bare name `io` is already bound to skimage.io in this file.
from io import StringIO

page = wikipedia.page('List of paintings by Pierre-Auguste Renoir')
html = page.html()

# Read the list of Renoir's paintings by decade, one table per decade.
# Note there are two more tables at the bottom; they're not of interest here.
# The markup is wrapped in StringIO because handing literal HTML to read_html
# is deprecated in recent pandas; a file-like object works on every version.
page_tables = pd.read_html(StringIO(html))[0:6]

# separate DFs by decade
df_1860s, df_1870s, df_1880s, df_1890s, df_1900s, df_1910s = page_tables


# In[528]:


# manually removing one from 1870 as it doesn't have an image file
# .copy() turns the filtered slice into an independent frame, so that adding a
# column to it afterwards does not raise pandas' SettingWithCopyWarning
df_1870s = df_1870s[df_1870s.Title != 'Conversation with the Gardener'].copy()


# In[529]:


# the above doesn't give the URLs of images attached to painting titles
# (the "Picture" field is NaN), those are in the images field of the page but they're not in order
# page.images

# so we shall use BeautifulSoup directly to parse those and attach to the dfs
soup = BeautifulSoup(html, 'lxml') # Parse the HTML as a string

# these are the decade tables
table_1860, table_1870, table_1880, table_1890, table_1900, table_1910 = soup.find_all('table')[0:6]


def _original_filename(thumb_src):
    """Return the path of the original file behind a thumbnail ``src``.

    Assumes the usual Wikimedia thumbnail layout,
    ``.../thumb/<h>/<hh>/<File.ext>/<size>px-<File.ext>``: everything after
    ``thumb/`` minus the last path component is the original's path.
    Dropping the last component (rather than cutting at '.jpg') keeps the
    result valid for .png, .JPG, .tif, ... files as well.

    Raises IndexError when ``thumb_src`` contains no ``thumb/`` segment.
    """
    return thumb_src.split('thumb/')[1].rsplit('/', 1)[0]


# and here we get the HREFs to the imgs for each decade; the column is added
# in place to each decade's dataframe, in table order
for decade_df, decade_table in (
    (df_1860s, table_1860),
    (df_1870s, table_1870),
    (df_1880s, table_1880),
    (df_1890s, table_1890),
    (df_1900s, table_1900),
    (df_1910s, table_1910),
):
    images = decade_table.find_all('a', class_='image')
    decade_df['original_filename'] = [_original_filename(image.find('img')['src'])
                                      for image in images]


# In[530]:


#images[-1]
#images[-1].find('img')['src'].split('thumb/')[1].split('.jpg')[0]


# In[535]:


# eye-ball the 1860s table
df_1860s


# # Now download all imgs
# 
# Download the original files from Wikimedia. Filenames need to be prefixed with the base URL. The lookup for the filename to use and how to build the URL to it has been investigated and tested.
# 
# If there are any that don't get downloaded (e.g. in cases where the file is not a .jpg) I try to fix them manually, or leave them be if I can't.

# In[76]:


base_url ='https://upload.wikimedia.org/wikipedia/commons/'

# # need to spoof it as a browser user agent, otherwise wikimedia does not
# # serve the file (presumably - that is what the header below works around)
# r = requests.get('https://upload.wikimedia.org/wikipedia/commons/6/65/Pierre-Auguste_Renoir_-_Portrait_de_la_m%C3%A8re_de_Renoir.jpg', 
#                  headers={'User-agent': 'Mozilla/5.0'})

# make sure the target folder is there before writing into it
out_dir = 'renoir_1910/'
os.makedirs(out_dir, exist_ok=True)

# do one at a time - it's relatively slow
for _, row in df_1910s.iterrows():
    img_url = base_url + row['original_filename']
    print(img_url)
    try:
        # the timeout stops one stalled connection from hanging the whole loop
        r = requests.get(img_url, headers={'User-agent': 'Mozilla/5.0'}, timeout=60)
    except requests.RequestException as exc:
        print('  skipped, request failed:', exc)
        continue
    if not r.ok:
        # do not store an HTTP error page under an image file name
        print('  skipped, HTTP status', r.status_code)
        continue
    # the context manager closes the file even if the write fails
    with open(out_dir + row['Title'] + '.jpeg', 'wb') as f:
        f.write(r.content)


# In[88]:


# number of rows in the 1910s table ...
len(df_1910s)


# In[87]:


# ... to compare with the number of entries in the download folder
get_ipython().system('ls renoir_1910/ | wc -l')


# ## Now investigate the colours!
# 
# Using k-means on each picture, asking for a fixed number of clusters (20 in the cell below)

# In[237]:


# Timed cell: fit k-means on the pixels of every image of one decade folder and
# keep, per file, the cluster centroids and the per-pixel cluster labels in `d`.
get_ipython().run_cell_magic('time', '', """
kmeans = KMeans(n_clusters=20)

# do one by one
dir_ = 'renoir_1890/'
d = {}

for filename in os.listdir(dir_):

    print(filename)

    try:
        img = io.imread(dir_ + filename)

        # shrink to 20% of each dimension so to reduce data, for speed
        img = cv2.resize(img, (0,0), fx=0.2, fy=0.2)
        # flatten to one row per pixel; this raises for images that do not have 3 channels
        img = np.reshape(img, (img.shape[0]*img.shape[1], 3))

        kmeans.fit(img)

    # there can be some in wrong format: report and skip them.
    # `Exception` rather than a bare except, so that interrupting the kernel
    # actually stops this slow loop instead of skipping one file.
    except Exception as exc:
        print('  skipped:', exc)
        continue

    d[filename] = {'centroids': kmeans.cluster_centers_.tolist(),
                   'labels': kmeans.labels_.tolist()}
""")


# In[238]:


# number of files that made it into the clustering results ...
len(d)


# In[239]:


# ... to compare with the number of entries in the folder
get_ipython().system('ls renoir_1890/ | wc -l')


# In[240]:


# dump to file; the context manager makes sure the JSON is flushed and the
# handle closed before anything tries to read the file back
with open('renoir_1890_clusters_20.json', 'w') as out_file:
    json.dump(d, out_file)


# ##  Read all files in, build single dict
# 
# For the RGB values, cast to int. For the labels, compute the occupation (in %) of each.

# In[256]:


from collections import Counter

# decade -> file name -> list of {'colour': [R, G, B] as ints,
#                                 'occupation': share of pixels in that cluster}
d_decades = {'1860': {}, '1870': {}, '1880': {}, '1890': {}, '1900': {}, '1910': {}}

for decade in d_decades:
    print(decade)
    # the context manager closes the file as soon as it has been parsed
    with open('renoir_{decade}_clusters.json'.format(decade=decade), 'r') as clusters_file:
        d = json.load(clusters_file)
    for title in d:
        labels = d[title]['labels']
        # count all labels in one pass instead of rescanning the list once per cluster;
        # a cluster with no pixel assigned counts as 0, like list.count would
        label_counts = Counter(labels)
        d_decades[decade][title] = [
            {'colour': [int(item) for item in centroid],
             'occupation': round(label_counts[i] / len(labels), 2)}
            for i, centroid in enumerate(d[title]['centroids'])
        ]


# In[257]:


# file names available for the 1880s
d_decades['1880'].keys()


# In[645]:


# colours and occupations found for one painting
d_decades['1880']['Portrait of Charles and George Durand-Ruel(French: Portrait de Charles et George Durand-Ruel).jpeg']


# In[210]:


# a file name picked for manual experiments
title = 'Girl Playing Croquet(French/ Fille jouant au croquet).jpeg'


# In[245]:


# spit a jpeg for each decade with the kmeans colours of each title
max_swatches = 20    # at most this many colours are drawn per title

for decade in d_decades:
    print(decade)
    for title in d_decades[decade]:
        print(title)

        l_ = d_decades[decade][title]    # list of dicts for the title
        # sort by occupation down, keep the biggest ones
        l_ = sorted(l_, key=lambda swatch: swatch['occupation'], reverse=True)[:max_swatches]
        if not l_:
            continue    # nothing to draw for this title

        # One axis per colour actually available (fewer than 20 used to raise
        # IndexError); squeeze=False keeps axs 2-D even with a single colour.
        # No separate plt.figure() call: it only created an extra, empty figure.
        fig, axs = plt.subplots(1, len(l_), squeeze=False)
        for i, swatch in enumerate(l_):
            print(swatch)
            _ = axs[0, i].imshow(np.full((1,1,3), swatch['colour']));

        # splitext strips only the extension; split('.')[0] cut titles at their first period
        fig.savefig('colours_20_' + decade + '/' + os.path.splitext(title)[0] + '_colours.jpeg')
        # release the figure once saved, otherwise one figure per painting stays in memory
        plt.close(fig)


# In[255]:


# everything collected for the 1880s
d_decades['1880']


# In[236]:


# `data` is only ever assigned inside the commented-out colormap experiment
# that follows, so evaluating it raises NameError when the file is run top to
# bottom; the expression is kept here as a comment only.
# data


# In[251]:


# data = np.random.rand(10, 10) * 20

# # create discrete colormap
# cmap = colors.ListedColormap(['red', 'blue'])
# bounds = [0,10,20]
# norm = colors.BoundaryNorm(bounds, cmap.N)

# fig, ax = plt.subplots()
# ax.imshow(data, cmap=cmap, norm=norm);

# # draw gridlines
# ax.grid(which='major', axis='both', linestyle='-', color='k', linewidth=2)
# ax.set_xticks(np.arange(-.5, 10, 1));
# ax.set_yticks(np.arange(-.5, 10, 1));

# plt.show()


# ## Other method: compare image to some palettes?
# 
# ### Renoir's palette
# 
# What was Renoir's standard palette is info I got from [here](https://www.jacksonsart.com/blog/2014/08/12/colours-used-sargent-renoir/) and [here](https://webartacademy.com/renoirs-palette) (taken from this one really). 
# 
# The k-means method is too coarse: it gives us colours that are too similar from painting to painting. Instead, what if we measure how much of his palette each painting uses?
# 
# The RGB of these colours have been chosen with a quick google search.
# 
# #### Some interesting videos about these colours:
# * [flake white](https://www.youtube.com/watch?v=vx6BI3Fqr6U)
# 
# This won't do - too subjective, there's no green, plus it's his, so it is derived from his own paintings, hence circular reasoning
# 
# ### A standard palette?
# 
# 
# ### Saturated colours + 

# ### Renoir's palette colours

# In[223]:


# renoir's palette
# pigment name -> [R, G, B], each channel in 0-255
d_r_palette = {
    'flake_white':[236, 236, 236],
    'naples_yellow':[250 ,218, 94],
    'viridian':[64, 130, 109],   # blue-green (#40826D); the previous value (255, 87, 51) is an orange-red
    'ivory_black':[35, 31, 32],
    'natural_earth':[128, 96, 67],
    'yellow_ochre':[245,197,44],
    'carmine':[247, 74, 70], # cannot find superfine carmine
    'venetian_red':[200, 8, 21],
    'cobalt_blue':[0, 71, 171],
    'lake_red':[184, 78, 112]
}

l_ = list(d_r_palette.values())   # list of RGB triplets, in palette order

# One axis per palette colour. The count comes from the palette itself, so
# adding or removing a pigment needs no change here; squeeze=False keeps axs
# 2-D whatever the count. No separate plt.figure() call: it only produced an
# extra, empty figure.
fig, axs = plt.subplots(1, len(l_), squeeze=False)

for i, rgb in enumerate(l_):
    print(rgb)
    _ = axs[0, i].imshow(np.full((1,1,3), rgb));


# In[249]:


# load one painting with skimage and show it
img = io.imread('renoir_1880/The Two Sisters(French: Les deux soeurs).jpeg')

plt.imshow(img)


# In[260]:


# for row in img:
#     print(row)


# ## Trying image segmentation
# 
# with proper techniques, using this [lib](https://pypi.org/project/colour-segmentation/)

# In[261]:


# In[518]:


# Try the segmentation library on a single painting first.
# NOTE(review): cv2.imread presumably yields None for an unreadable path, in
# which case the resize would be what fails - confirm.
img = cv2.imread("renoir_1910/Portrait of Ambroise Vollard.jpeg")   # the lib is designed to work with OpenCV, so reading BGR
img = cv2.resize(img, (0,0), fx=0.2, fy=0.2)    # for speed

segmentator = Segmentator(image=img)


# In[519]:


# result_liu = segmentator.segment(method=SegmentationAlgorithm.FUZZY_SET_LIU,
#                                  apply_colour_correction=False,
#                                  remove_achromatic_colours=True)

# NOTE(review): despite the variable name, the method used here is
# FUZZY_SET_CHAMORRO, not Amante-Fonseca.
result_amante_fonseca_achr = segmentator.segment(method=SegmentationAlgorithm.FUZZY_SET_CHAMORRO,
                                                 remove_achromatic_colours=False)


# In[520]:


# Inspect the result: highest class index, total and per-class colour
# proportions over the classes listed in colours_cm, and one class on its own.
# NOTE(review): colours_cm is defined in a later cell, so these lines only
# work once that cell has been run.
result_amante_fonseca_achr.segmented_classes.max()
sum([result_amante_fonseca_achr.get_colour_proportion(i) for i in range(len(colours_cm))])
[result_amante_fonseca_achr.get_colour_proportion(i) for i in range(len(colours_cm))]
result_amante_fonseca_achr.get_colour_proportion(1)


# In[521]:


# the segmented image is converted from BGR to RGB for display with pyplot
plt.imshow(cv2.cvtColor(result_amante_fonseca_achr.segmented_image, cv2.COLOR_BGR2RGB))


# ### Systematically and with all methods
# 
# Each method uses a different set of colours. The colours for each method are taken from the lib docs [here](https://mmunar97.gitbook.io/colour-segmentation/examples/examples-of-fuzzy-logic-based-methods)
# 
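# We'll be using each method with the flag that controls the removal of the achromatic colours (white, black, grey). Note that the code below ends up passing `remove_achromatic_colours=False`, because `True` gave occupations that don't sum to 1.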

# In[329]:


# Amante-Fonseca
# class index -> [R, G, B]; the index is the position in the list below
colours_af = dict(enumerate([
    [255, 33, 36],      # 0: red
    [170, 121, 66],     # 1: brown
    [255, 146, 0],      # 2: orange
    [255, 251, 0],      # 3: yellow
    [0, 255, 0],        # 4: green
    [0, 253, 255],      # 5: cyan
    [0, 0, 255],        # 6: blue
    [147, 33, 146],     # 7: purple
    [255, 64, 255],     # 8: pink
]))

# Chamorro - Martinez
# class index -> [R, G, B]; the index is the position in the list below
colours_cm = dict(enumerate([
    [255, 33, 36],      # 0: red
    [255, 148, 9],      # 1: orange
    [255, 255, 13],     # 2: yellow
    [186, 255, 15],     # 3: yellow-green
    [6, 155, 9],        # 4: green
    [12, 255, 116],     # 5: green-cyan
    [11, 254, 255],     # 6: cyan
    [8, 192, 255],      # 7: cyan-blue
    [0, 0, 255],        # 8: blue
    [92, 8, 253],       # 9: blue-magenta
    [238, 3, 249],      # 10: magenta
    [254, 6, 180],      # 11: magenta-red
]))

# Liu-Wang
# class index -> [R, G, B]; the index is the position in the list below
colours_lw = dict(enumerate([
    [255, 33, 36],      # 0: red
    [248, 149, 29],     # 1: orange
    [239, 233, 17],     # 2: yellow
    [105, 189, 69],     # 3: green
    [111, 204, 221],    # 4: cyan
    [59, 83, 164],      # 5: blue
    [158, 80, 159],     # 6: purple
]))

# Shamir
# class index -> [R, G, B]; the index is the position in the list below
colours_s = dict(enumerate([
    [255, 33, 36],      # 0: red
    [255, 140, 0],      # 1: dark orange
    [255, 165, 0],      # 2: light orange
    [255, 255, 0],      # 3: yellow
    [144, 238, 144],    # 4: light green
    [0, 100, 0],        # 5: dark green
    [0, 255, 255],      # 6: aqua
    [0, 0, 255],        # 7: blue
    [128, 0, 128],      # 8: dark purple
    [255, 0, 255],      # 9: light purple
]))


# In[367]:


# run the same logic as in the example above on all imgs, with all four
# segmentation methods, and collect the occupation of each colour class per image

decades = ['1860', '1870', '1880', '1890', '1900', '1910']

# one results dict per method: decade -> file name -> {colour class: proportion}
d_af = {decade: {} for decade in decades}
d_cm = {decade: {} for decade in decades}
d_lw = {decade: {} for decade in decades}
d_s = {decade: {} for decade in decades}

# (results dict, algorithm, colour classes, extra keyword arguments for segment())
methods = [
    (d_af, SegmentationAlgorithm.FUZZY_SET_AMANTE, colours_af, {}),       # Amante-Fonseca
    (d_cm, SegmentationAlgorithm.FUZZY_SET_CHAMORRO, colours_cm, {}),     # Chamorro-Martinez
    (d_lw, SegmentationAlgorithm.FUZZY_SET_LIU, colours_lw,
     {'apply_colour_correction': False}),                                 # Liu-Wang
    (d_s, SegmentationAlgorithm.FUZZY_SET_SHAMIR, colours_s, {}),         # Shamir
]

for decade in decades:
    print(decade)
    for filename in os.listdir('renoir_{decade}'.format(decade=decade)):
        #print(filename)
        try:
            img = cv2.imread("renoir_{decade}/{filename}".format(decade=decade, filename=filename))

            # resize for speed
            img = cv2.resize(img, (0,0), fx=0.2, fy=0.2)

            proportions = []
            for _, method, colour_classes, extra_kwargs in methods:
                # a fresh Segmentator per method, as before
                segmentator = Segmentator(image=img)
                # I tried remove_achromatic_colours=True but it gives
                # not-normalised values (sum of occupations isn't 1)
                result = segmentator.segment(method=method,
                                             remove_achromatic_colours=False,
                                             **extra_kwargs)
                proportions.append({k: result.get_colour_proportion(k) for k in colour_classes})

        # Unreadable or odd files are skipped, but visibly. `Exception` rather
        # than a bare except, so that interrupting the kernel stops the loop.
        except Exception as exc:
            print('  skipped', filename, '-', exc)
            continue

        # Store only once every method has succeeded: no empty or half-filled
        # entry is left behind to break the aggregations done on these dicts.
        for (results, _, _, _), image_proportions in zip(methods, proportions):
            results[decade][filename] = image_proportions


# In[368]:


# total of the Amante-Fonseca proportions stored for one painting
sum(d_af['1860']['Chalands sur la Seine (Barges on the Seine).jpeg'].values())


# In[378]:


# d_lw['1880'].keys()
# sum(d_s['1880']['Blonde Bather (1881)(French: La baigneuse blonde).jpeg'].values())


# In[400]:


# number of 1910s files with Amante-Fonseca results
len(d_af['1910'])


# ## Round up occupations
# 
# To 2 decimal digits - this is for ease of eye-balling

# In[539]:


# Round the stored occupations in place, painting by painting.
# Only the Chamorro-Martinez results (d_cm) are rounded here.
for decade in decades:
    for title, occupations in d_cm[decade].items():
        for k, value in occupations.items():
            occupations[k] = round(value, 2)


# ## Look at aggregated measures from this data

# In[540]:


# eye-ball means of colours by decade for one of the colour methods:
# per decade, one line per colour class with the mean and the standard
# deviation of its occupation across that decade's paintings

# the second heading gets a leading '\n' argument to set it apart from the first
for heading_prefix, decade in (((), '1860'), (('\n',), '1870')):
    print(*heading_prefix, decade)
    for k in colours_af:
        occupations = [d_af[decade][title][k] for title in d_af[decade]]
        print(k, np.mean(occupations), np.std(occupations))


# In[542]:


# reminder of which class index is which colour
colours_cm


# In[543]:


# plotting bars - these are coloured with the RGBs of the colour from colour method
d_plot = d_cm.copy()      # choose the colour method here
d_colours = colours_cm.copy()    # and the relative colours dict

fig, axs = plt.subplots(3, 2, figsize=(10, 10))
fig.suptitle('Mean occupation of colour for colour method')

# one panel per decade, filled row by row (axs.flat walks the 3x2 grid in row-major order)
for decade, ax in zip(('1860', '1870', '1880', '1890', '1900', '1910'), axs.flat):
    ax.set_title(decade)
    if not d_plot[decade]:
        continue    # no painting processed for this decade: leave the panel empty

    # colour classes, read off the first painting of the decade
    x = list(next(iter(d_plot[decade].values())))
    # mean occupation of each class over the decade's paintings
    y = [np.mean([d_plot[decade][title][k] for title in d_plot[decade]]) for k in x]
    # this must be a list of tuples with channels in 0-1, as per pyplot
    colours = [tuple(item/255 for item in d_colours[i]) for i in x]
    ax.barh(x, y, color=colours)

plt.show();

#x, y, colours


# In[544]:


# tot of titles processed for colour, summed over the six decades
sum(len(d_af[decade_key])
    for decade_key in ('1860', '1870', '1880', '1890', '1900', '1910'))


# In[740]:


# file names with Chamorro-Martinez results for the 1880s
d_cm['1880'].keys()


# In[715]:


# how many 1910s files have Chamorro-Martinez results
len(d_cm['1910'])


# In[741]:


# occupations stored for one painting
d_cm['1880']["Les grandes baigneuses(The Large Bathers).jpeg"]


# In[742]:


# the full 1880s table, for cross-reference
df_1880s.iloc[:]


# ## Other refs
# 
# * A nice [page](https://realpython.com/python-opencv-color-spaces/) on image segmentation with Python 
# * [How to choose a palette for painting](https://www.artsy.net/article/artsy-editorial-4-colors-excel-painting)
# * [The colour palettes of 10 famous paintings](https://www.haydnsymons.com/blog/the-colour-palettes-used-by-10-famous-paintings/)

# In[ ]: