#!/usr/bin/env python
# coding: utf-8

# # Files
# - data is usually stored in secondary storage medium such as hard drive, flash drive, cd-rw, etc. using named locations called files
# - files can be organized into folders
# - use open() built-in function to work with files

# In[ ]:

help(open)

# ## write data to a file
# - open file with a name
# - write data
# - close file

# In[ ]:

# old school: explicit open()/close(); prefer the `with` form shown below
fw = open('test.txt', 'a')  # 'a' is append mode; creates the file if it doesn't exist
fw.write('words\n=====\n')
fw.write('apple\nball\ncat\ndog\n')
fw.write('elephant\n')
fw.close()

# In[ ]:

# newer and better syntax
with open('words.txt', 'w') as fw:
    fw.write('apple\nball\ncat\ndog\n')
    fw.write('elephant\n')
    fw.write('zebra\n')
# file will be automatically closed when with block is finished executing

# ## read data from a file
# - open file with its name; can provide relative or absolute path
# - read in various ways; one line at a time, all lines, bytes, whole file, etc.
# - use data
# - close file

# In[21]:

# read whole file as a list of lines; `with` closes the file automatically
with open('words.txt') as fr:  # 'r' or read mode by default; file must exist
    data = fr.readlines()

# In[ ]:

help(fr)

# In[22]:

data

# In[23]:

data.sort(reverse=True)
with open('sorted_words.txt', 'w') as newFile:
    # lines already end in '\n', so they can be written back verbatim
    newFile.writelines(data)

# ## fetch a page from the web

# In[24]:

import urllib.request

url = 'http://org.coloradomesa.edu/~rbasnet/teaching.html'
localfile = 'teaching.html'
# download url and save it to localfile
urllib.request.urlretrieve(url, localfile)

# ## reading the whole file at once

# In[ ]:

# read /usr/share/dict/words file in linux
# windows path might be " c:/temp/words.txt" or c:\\temp\words.txt"

# In[25]:

with open(localfile) as f:
    data = f.read()
# split() with no argument splits on ANY run of whitespace (spaces, tabs,
# newlines) and discards empty strings; split(' ') would fuse words that are
# separated by newlines and miscount them
words = data.split()
print('There are {0} words in the file.'.format(len(words)))

# In[26]:

print(words[:10])

# ## parsing HTML using BeautifulSoup library
# - https://www.crummy.com/software/BeautifulSoup/bs4/doc/#
# - Alternative is nltk (Natural Language Toolkit) library
# - http://www.nltk.org/

# In[27]:

from bs4 import BeautifulSoup

localfile = 'teaching.html'
with open(localfile) as f:
    soup = BeautifulSoup(f.read(), 'lxml')
# strip HTML tags, keeping only the visible text
text = soup.get_text()
print(text)

# In[ ]:

# break into lines and remove leading and trailing space on each line
lines = [line.strip() for line in text.splitlines()]

# In[28]:

print(lines)

# In[30]:

# create list of words by splitting multi-word elements
words = [word.strip() for line in lines for word in line.split()]

# In[31]:

print(words)

# In[32]:

print('There are {0} words in the file.'.format(len(words)))

# ## working with binary files

# In[33]:

with open('cmu-logo.png', 'rb') as rbf:  # rb - read binary mode
    data = rbf.read()  # read the whole binary file
with open('cmu-logo-copy.png', 'wb') as wbf:
    wbf.write(data)  # write the whole binary file

# In[34]:

print(len(data))

# In[35]:

# find the size of the data in bytes
# NOTE: sys.getsizeof is shallow -- it reports the bytes object's own footprint
# (payload plus object overhead), which is why it differs slightly from len(data)
import sys
print(sys.getsizeof(data))

# ## exercises
# 1. Write a program that reads a file and writes out a new file with the lines in reversed order (i.e. the first line in the old file becomes the last one in the new file.)
# 2. Write a program that reads a file and prints only those lines that contain the substring snake.
# 3. Write a program that reads a text file and produces an output file which is a copy of the file, except the first five columns of each line contain a four digit line number, followed by a space. Start numbering the first line in the output file at 1. Ensure that every line number is formatted to the same width in the output file. Use one of your Python programs as test data for this exercise: your output should be a printed and numbered listing of the Python program.
# 4. Write a program that undoes the numbering of the previous exercise: it should read a file with numbered lines and produce another file without line numbers.

# In[ ]: