I have a main folder that contains sub-folders, each of which in turn contains the images for one class of the dataset, as follows:
-main_db
---CLASS_1
-----img_1
-----img_2
-----img_3
-----img_4
---CLASS_2
-----img_1
-----img_2
-----img_3
-----img_4
---CLASS_3
-----img_1
-----img_2
-----img_3
-----img_4
I need to split this dataset into two parts, i.e. training data (70%) and test data (30%). Below is the hierarchy I want to achieve:
-main_db
---training_data
-----CLASS_1
-------img_1
-------img_2
-------img_3
-------img_4
-----CLASS_2
-------img_1
-------img_2
-------img_3
-------img_4
---testing_data
-----CLASS_1
-------img_5
-------img_6
-------img_7
-------img_8
-----CLASS_2
-------img_5
-------img_6
-------img_7
-------img_8
Any help appreciated. Thanks
I have tried the module below, but it is not working for me: it cannot be imported at all.
https://github.com/jfilter/split-folders
This is exactly what I want.
This should do it. It calculates how many images are in each class folder and then splits them accordingly, moving the test data into a different folder with the same structure.
Save the code in a file named main.py and run the command:
python3 main.py --data_path=/path1 --test_data_path_to_save=/path2 --train_ratio=0.7
import shutil
import os
import numpy as np
import argparse
def get_files_from_folder(path):
    """Return the names of all entries in ``path`` as a sorted NumPy array.

    ``os.listdir`` yields entries in arbitrary, filesystem-dependent order.
    Sorting makes the result (and anything that picks "the first N files"
    from it) reproducible across runs and machines.

    Args:
        path: Directory to list.

    Returns:
        numpy.ndarray holding the entry names (not full paths) in sorted order.
    """
    return np.asarray(sorted(os.listdir(path)))
def main(path_to_data, path_to_test_data, train_ratio):
    """Move a share of every class folder's files into a test directory.

    Each immediate sub-directory of ``path_to_data`` is treated as one class.
    For every class, ``round(n_entries * (1 - train_ratio))`` entries are
    moved to ``path_to_test_data/<class>``; the remaining entries stay where
    they are and form the training set.

    Args:
        path_to_data: Directory whose immediate sub-directories are classes.
        path_to_test_data: Directory that receives the test files; it and the
            per-class sub-directories are created when missing.
        train_ratio: Fraction of each class kept for training, within [0, 1].

    Raises:
        ValueError: If ``train_ratio`` is not within [0, 1].
    """
    # Reject bad ratios before touching the filesystem: a value above 1 would
    # silently move nothing, a negative one would fail half-way through.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            "train_ratio must be between 0 and 1, got {!r}".format(train_ratio)
        )

    test_root = os.path.abspath(path_to_test_data)

    # Only the first level of the tree is inspected: one folder per class.
    _, class_dirs, _ = next(os.walk(path_to_data))

    for class_name in class_dirs:
        path_to_original = os.path.join(path_to_data, class_name)

        # When the test folder lives inside the data folder (e.g. on a
        # re-run), it must not be mistaken for a class of its own.
        if os.path.abspath(path_to_original) == test_root:
            continue

        # os.listdir order is arbitrary; sorting makes the split reproducible.
        entries = sorted(os.listdir(path_to_original))
        test_count = int(round(len(entries) * (1 - train_ratio)))

        path_to_save = os.path.join(path_to_test_data, class_name)
        os.makedirs(path_to_save, exist_ok=True)

        for entry in entries[:test_count]:
            shutil.move(
                os.path.join(path_to_original, entry),
                os.path.join(path_to_save, entry),
            )
def parse_args(argv=None):
    """Parse the command-line options of the dataset divider.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read the arguments from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``data_path`` and ``test_data_path_to_save``
        as strings and ``train_ratio`` as a float.
    """
    parser = argparse.ArgumentParser(description="Dataset divider")
    parser.add_argument("--data_path", required=True,
                        help="Path to data")
    parser.add_argument("--test_data_path_to_save", required=True,
                        help="Path to test data where to save")
    # argparse applies %-formatting to help strings, so a literal percent sign
    # has to be written as "%%"; a bare "%" makes --help raise ValueError.
    # type=float lets argparse report a non-numeric ratio as a usage error.
    parser.add_argument("--train_ratio", required=True, type=float,
                        help="Train ratio - 0.7 means splitting data in 70 %% train and 30 %% test")
    return parser.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: read the command line and run the split.
    # float() guarantees main() receives a numeric ratio whatever type the
    # parser handed back.
    cli = parse_args()
    main(
        path_to_data=cli.data_path,
        path_to_test_data=cli.test_data_path_to_save,
        train_ratio=float(cli.train_ratio),
    )