#!/usr/bin/env python
# coding: utf-8

# # 获取必应桌面壁纸
# 
# 网上冲浪的时候，发现了一个不错的必应壁纸网址：
# 
# https://bing.wallpaper.pics
# 
# 可以利用所学的知识，每天定时去这个网站把图片下载下来。
# 
# 先定义一个URL链接对象：

# In[1]:


# Page that is fetched below; the date is part of the file name.
webpage = "https://bing.wallpaper.pics/us/20220506.html"


# The urllib module is needed to access this address:

# In[2]:


import urllib.request


# Some sites use simple anti-crawler checks, so the request has to carry the
# parameters a normal page visit would send, such as the HTTP headers.
# 
# urllib.request provides the Request object for passing those headers in:

# In[3]:


headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Safari/537.36'
    ),
}


# In[4]:


webpage_url = urllib.request.Request(url=webpage, headers=headers)


# Open the address through that Request object, with a 20-second timeout:

# In[5]:


with urllib.request.urlopen(webpage_url, timeout=20) as response:
    data = response.read()


# urllib returns a bytes object rather than a str, so it has to be decoded
# into a string before any text processing:

# In[6]:


type(data)


# In[7]:


content = str(data, encoding="utf-8")


# The wallpaper image URLs can be extracted from the page text with a regular
# expression:

# In[8]:


import re


# In[9]:


# Raw string: backslashes reach the regex engine untouched. The dot before
# "jpg" is escaped so that it only matches a literal "." (an unescaped dot
# would accept any character in that position).
pic_url = None
for g in re.finditer(r"src='(//[^']*\.jpg)", content):
    # The captured link starts with "//" (no scheme), so one is prepended.
    pic_url = "http:" + g.group(1)
    print(pic_url)

# Without a match the name would never be bound and the next cell would die
# with an unrelated NameError; report the real problem instead.
if pic_url is None:
    raise RuntimeError("no wallpaper image URL found in the page content")


# Note: this pattern was written after looking at the page's source code; it
# is not a general-purpose way of extracting URLs.

# A regular expression can also parse the picture's file name out of its URL:

# In[10]:


# pic_url always ends in ".jpg" (guaranteed by the pattern above), so this
# search cannot return None. Only the last URL printed above is used.
pic_name = re.search(r"([^/.&]*\.jpg)", pic_url).group(1)


# In[11]:


pic_name


# Download the picture with `urllib.request.urlretrieve()`; it is saved in the
# current working directory under the name parsed from the URL:

# In[12]:


urllib.request.urlretrieve(pic_url, pic_name)


# Load the picture:

# In[13]:


from PIL import Image


# In[14]:


pic = Image.open(pic_name)


# In[15]:


pic


# In[16]:


# Clean up after the demonstration. Only the file downloaded above is removed:
# the previous `rm *.jpg` wiped every JPEG in the working directory and needed
# a shell `rm` command. The image is closed first so that no open handle to
# the file remains when it is deleted.
import os

pic.close()
os.remove(pic_name)


# Bing publishes a new wallpaper every day. To always fetch the previous day's
# picture, the page URL can be computed from the current date instead of being
# hard-coded:

# In[17]:


import datetime


# In[18]:


# Adding a negative one-day delta steps back to yesterday.
pre_date = datetime.date.today() + datetime.timedelta(days=-1)


# In[19]:


# The page name embeds the date as YYYYMMDD.
webpage = f"https://bing.wallpaper.pics/us/{pre_date:%Y%m%d}.html"


# In[20]:


webpage


# Wallpapers from other dates may be wanted as well, so the script version
# accepts one optional command-line argument (read from sys.argv): the number
# of days back at which the download loop starts. After each day's page has
# been processed the script sleeps for a random 1-5 seconds (time + random)
# to avoid triggering anti-crawler measures.
# 
# Putting the previous steps together, the complete program "wallpaper.py" is:

# In[21]:


# The script text is kept in a raw triple-quoted literal so that it reads
# exactly as it will appear in wallpaper.py (no escaped quotes or "\n").
# Changes from the earlier version of the script:
#   * the page date uses `days=i`; it used `days=1`, so every loop iteration
#     fetched yesterday's page and the max_days argument had no effect;
#   * both regexes are raw strings and the dot before "jpg" is escaped.
WALLPAPER_SCRIPT = r'''import sys
import re
import datetime
import urllib.request
import time
import random
from pathlib import Path

max_days = 1
if len(sys.argv) == 2:
    max_days = int(sys.argv[1])

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}

wallpaper = Path("wallpaper")
if not wallpaper.exists():
    wallpaper.mkdir()

for i in range(max_days, 0, -1):
    cur_date = datetime.date.today() - datetime.timedelta(days=i)
    webpage = f"https://bing.wallpaper.pics/us/{cur_date.strftime('%Y%m%d')}.html"
    webpage_url = urllib.request.Request(webpage, headers=headers)
    print(webpage)
    with urllib.request.urlopen(webpage_url, timeout=20) as f:
        data = f.read()
    content = data.decode("utf-8")
    for g in re.finditer(r"src='(//[^']*\.jpg)", content):
        pic_url = "http:" + g.group(1)
        print(pic_url)
        pic_name = re.search(r"([^/.&]*\.jpg)", pic_url).group(1)
        if not (wallpaper / pic_name).exists():
            urllib.request.urlretrieve(pic_url, wallpaper / pic_name)
    time.sleep(random.randint(1, 5))
'''

get_ipython().run_cell_magic('writefile', 'wallpaper.py', WALLPAPER_SCRIPT)


# In[22]:


# Remove the generated script again; it was only written for demonstration.
get_ipython().run_line_magic('rm', 'wallpaper.py')


# In[ ]: