diff --git a/scrap_data.sh b/scrap_data.sh
new file mode 100755
index 0000000..7fe49f1
--- /dev/null
+++ b/scrap_data.sh
@@ -0,0 +1,57 @@
+#!/bin/sh
+
+#
+# Scrapes the Kanto Pokémon pictures from Bulbapedia.
+#
+# The list page is cached as 'bulbapedia.html' inside a 'scrapped-data'
+# folder under the current directory, and each picture is saved next to
+# it as 'NAME.png'. Requires 'wget' and 'sed'.
+#
+
+# Stop on unhandled errors and on use of unset variables.
+set -eu
+
+bulbapedia_page_url="http://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_Kanto_Pok%C3%A9dex_number"
+bulbapedia_page_name="bulbapedia.html"
+scrap_folder="$(pwd)/scrapped-data"
+bulbapedia_page_path="$scrap_folder/$bulbapedia_page_name"
+
+# Make sure the directory for the scraped data is there.
+mkdir -p "$scrap_folder"
+
+# Download the Bulbapedia page unless a cached copy already exists.
+if [ ! -e "$bulbapedia_page_path" ]; then
+  echo " > Downloading '$bulbapedia_page_url' to '$bulbapedia_page_path' ..."
+  if ! wget "$bulbapedia_page_url" -O "$bulbapedia_page_path" -q; then
+    # A failed 'wget -O' can leave an empty or partial file behind; remove
+    # it so the next run downloads again instead of reusing a bad cache.
+    rm -f "$bulbapedia_page_path"
+    echo " > Could not download '$bulbapedia_page_url'." >&2
+    exit 1
+  fi
+  echo " > Downloaded."
+fi
+
+# 'sed' filters the cached page and prints one "NAME=URL" line for each
+# picture it finds. The loop reads that output one line at a time, splits
+# each line at its first '=' and downloads URL to 'NAME.png'.
+#
+# NOTE(review): the pattern assumes every picture is an 'img' tag whose
+# 'alt' attribute is the Pokémon name and whose 'src' attribute is a
+# '.png' url -- confirm it against the markup of the downloaded page.
+sed -n -E 's;^.*<img alt="([^"]*)" src="([^"]*\.png)".*$;\1=\2;p' \
+  "$bulbapedia_page_path" |
+while IFS= read -r line; do
+  pokemon_name="${line%%=*}"
+  pokemon_url="${line#*=}"
+
+  # Unescape the HTML apostrophe entity... Damn "Farfetch'd".
+  pokemon_name=$(printf '%s\n' "$pokemon_name" | sed "s/&#0*39;/'/g")
+
+  echo " > Downloading '$pokemon_name' from '$pokemon_url' to '$scrap_folder/$pokemon_name.png' ..."
+  if ! wget "$pokemon_url" -O "$scrap_folder/$pokemon_name.png" -q; then
+    # Keep going with the other pictures, but do not keep a broken file.
+    rm -f "$scrap_folder/$pokemon_name.png"
+    echo " > Could not download '$pokemon_url'." >&2
+  fi
+done
diff --git a/scrap_pokemon.py b/scrap_pokemon.py
deleted file mode 100755
index 187023b..0000000
--- a/scrap_pokemon.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import sys
-import html
-import urllib.request
-
-# Load the pokemon sprites page.
-with open('scrapped-data/bulbapedia.html', 'r') as page:
-    html_page = "".join(page.readlines());
-
-# Find all pokemon name and image urls.
-image_regex = r'(.*)'
-all_pokemon = re.findall(image_regex, html_page)
-
-# Save the image of each pokemon.
-for pokemon in all_pokemon:
-    # Unpack the tuple data.
-    name, image_url = pokemon
-
-    # Clean HTML escape sequences in the name
-    name = html.unescape(name)
-
-    # Set file path for the image.
-    file_path = './scrapped-data/' + name + '.png'
-
-    # Tell the user what we are doing here.
-    print('Downloading "{}" image to "{}"...'.format(name, file_path))
-
-    # Download the image.
-    with open(file_path, 'wb') as pokemon_file:
-        with urllib.request.urlopen(image_url) as pokemon_image:
-            pokemon_file.write(pokemon_image.read())