Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Omniglot dataset #8

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions data/omniglot/preprocess.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Prepare the Omniglot dataset: convert the raw data to .json when that has
# not been done yet, then hand off to the shared ../utils/preprocess.sh.
# Every command-line argument is forwarded to ../utils/preprocess.sh.
# All paths below are relative to the current working directory.
set -x

# download data and convert to .json format
# (skipped when data/all_data already exists and is non-empty)
if [ ! -d "data/all_data" ] || [ ! "$(ls -A data/all_data)" ]; then
    # Abort instead of running the converter from the wrong directory.
    pushd preprocess || exit 1
    ./data_to_json.sh
    popd || exit 1
fi

NAME="omniglot" # name of the dataset, equivalent to directory name

cd ../utils || exit 1

# "$NAME" and "$@" are quoted so that arguments containing spaces or glob
# characters reach the shared script exactly as the caller passed them.
./preprocess.sh --name "$NAME" "$@"

cd "../$NAME"
45 changes: 45 additions & 0 deletions data/omniglot/preprocess/data_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import json
import glob
import numpy as np

from PIL import Image
from collections import defaultdict

image_size = (28, 28)
status_update_after = 5000  # images processed

# Directory one level above the directory that contains this script.
parent_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
# Glob pattern: raw_data/images_*/<dir>/<dir>/<file>.png
image_paths = os.path.join(parent_path, 'data', 'raw_data', 'images_*', '*', '*', '*.png')


def parse_character_file(character_file):
    """Split an image path into ``(character_class, user_id)``.

    ``character_class`` is the three directory names directly above the file
    joined with ``'.'``; ``user_id`` is the part of the file name before the
    first ``'_'``.
    """
    # Normalise first so the split works with the platform's own separator
    # instead of assuming '/'.
    parts = os.path.normpath(character_file).split(os.sep)
    character_class = '.'.join(parts[-4:-1])
    user_id = parts[-1].split('_')[0]
    return character_class, user_id


def load_flattened_image(character_file):
    """Return the image as a flat list of floats scaled to [0, 1].

    The image is resized to ``image_size`` with Lanczos resampling and then
    converted to 8-bit greyscale before flattening.
    """
    # The context manager guarantees the underlying file is closed even if
    # resizing raises.
    with Image.open(character_file) as img:
        resized = img.resize(image_size, resample=Image.LANCZOS)
    flattened_img = np.array(resized.convert('L')).flatten() / 255.
    return flattened_img.tolist()


def collect_user_data(character_files):
    """Group the flattened images by user id.

    Returns ``(user_class, user_data)`` where ``user_class`` maps each user id
    to the character class of the first file seen for it, and ``user_data``
    maps each user id to ``{'x': [...], 'y': [...]}``.
    """
    user_class = dict()
    user_data = dict()

    for i, character_file in enumerate(character_files):
        character_class, user_id = parse_character_file(character_file)

        if user_id not in user_class:
            user_class[user_id] = character_class
            user_data[user_id] = {'x': list(), 'y': list()}
        user_data[user_id]['x'].append(load_flattened_image(character_file))
        # NOTE(review): the label stored in 'y' is the user id itself, not
        # character_class -- confirm this is the intended label.
        user_data[user_id]['y'].append(user_id)

        if (i + 1) % status_update_after == 0:
            print("{} images converted".format(i + 1))

    return user_class, user_data


def build_all_data(user_class, user_data):
    """Assemble the dictionary that is serialised to ``all_data.json``.

    ``num_samples`` lists, in the same order as ``users``, how many images
    each user has.
    """
    all_data = dict()
    all_data['users'] = list(user_class.keys())
    all_data['num_samples'] = [len(user_data[x]['x']) for x in all_data['users']]
    all_data['user_data'] = user_data
    return all_data


def main():
    """Convert every image matching ``image_paths`` and write the JSON file."""
    # Sorted so the order of users in the output does not depend on the
    # arbitrary order in which glob lists the file system.
    character_files = sorted(glob.glob(image_paths))
    user_class, user_data = collect_user_data(character_files)
    all_data = build_all_data(user_class, user_data)

    file_name = 'all_data.json'
    output_dir = os.path.join(parent_path, 'data', 'all_data')
    # Create the target directory so the script also works when run directly.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, file_name)

    with open(file_path, 'w') as outfile:
        json.dump(all_data, outfile)


if __name__ == '__main__':
    main()
17 changes: 17 additions & 0 deletions data/omniglot/preprocess/data_to_json.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Fetch the raw data via get_data.sh and, when ../data/all_data is empty,
# convert it to .json with data_to_json.py.

# The relative paths below are resolved from this script's own directory,
# so the script behaves the same regardless of the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

# Setup data directories, if they don't already exist.
# Creating all_data up front keeps the emptiness check below from running
# `ls` on a missing directory.
mkdir -p ../data/raw_data ../data/all_data || exit 1

# Check and download data if needed; stop here if that step reports failure.
./get_data.sh || exit 1

if [ -z "$(ls -A ../data/all_data)" ]; then
    echo "------------------------------"
    echo "converting data to .json format"
    # Do not announce success when the conversion itself failed.
    python3 data_to_json.py || exit 1
    echo "finished converting data to .json format"
fi
18 changes: 18 additions & 0 deletions data/omniglot/preprocess/get_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# Download and unpack the image archives listed in data_folders into
# ../data/raw_data, skipping any whose directory is already present.

# No trailing slash: the separator is added when the URL is assembled below.
DOWNLOAD_URL="https://raw.githubusercontent.com/brendenlake/omniglot/master/python"
declare -a data_folders=( "images_background" "images_evaluation" )

# Stop if the target directory is missing rather than downloading into
# whatever directory the caller happens to be in.
pushd "../data/raw_data" || exit 1
echo "------------------------------"
for data_folder in "${data_folders[@]}"; do
    if [ ! -d "${data_folder}" ]; then
        echo "Downloading ${data_folder}"
        # TLS certificate verification is left enabled (no
        # --no-check-certificate) so the server's identity is checked.
        if ! wget -O "${data_folder}.zip" "${DOWNLOAD_URL}/${data_folder}.zip"; then
            echo "Failed to download ${data_folder}.zip" >&2
            rm -f "${data_folder}.zip"
            exit 1
        fi
        if ! unzip "${data_folder}.zip"; then
            echo "Failed to unpack ${data_folder}.zip" >&2
            # Remove any partial extraction so the next run retries instead
            # of reporting the directory as already present.
            rm -rf "${data_folder}" "${data_folder}.zip"
            exit 1
        fi
        rm "${data_folder}.zip"
    else
        echo "Found Omniglot image directory ${data_folder}"
    fi
done
popd