Source code for nlp_data_py.commons.splitter

import math
from logging import Logger
from typing import List, Optional

from nlp_data_py.commons.utils.helpers import Helpers
from nlp_data_py.commons.utils.logging import Logging


class Splitter:
    """Decide which page numbers of a book go into which dataset.

    The class never touches page contents; it only partitions page numbers.
    Assigning ``num_of_pages`` builds the page order and the dataset
    partition in one step.

    Args:
        split_ratios: ratios used to split the book, one per dataset.
            Defaults to ``[0.8, 0.1, 0.1]``.
        dataset_names: names of the datasets to split into.
            Defaults to ``['train', 'val', 'test']``.
        shuffle: shuffle the page numbers before splitting.

    Attributes:
        num_of_pages: number of pages to split. Setting it (re)computes
            ``shuffled_pages`` and ``ds_to_pages``; neither exists before
            the first assignment.
        shuffled_pages: page numbers in the order they are dealt out.
        ds_to_pages: dict mapping each dataset name to its page numbers.

    Example:
        ::

            splitter: Splitter = Splitter(split_ratios=[0.8, 0.1, 0.1],
                                          dataset_names=['train', 'val', 'test'],
                                          shuffle=True)
            splitter.num_of_pages = 10
            print(splitter.shuffled_pages)
            >>> [4, 3, 1, 0, 8, 6, 9, 7, 2, 5]
            print(splitter.ds_to_pages)
            >>> {
                'train': [4, 3, 1, 0, 8, 6, 9, 7]
                'val': [2]
                'test': [5]
            }
    """

    logger: Logger = Logging.get_logger("SplitBook")

    def __init__(self,
                 split_ratios: Optional[List[float]] = None,
                 dataset_names: Optional[List[str]] = None,
                 shuffle: bool = True):
        # Build the defaults per instance: a list literal in the signature
        # would be one object shared by every Splitter ever created.
        self.split_ratios = [0.8, 0.1, 0.1] if split_ratios is None else split_ratios
        self.dataset_names = (['train', 'val', 'test']
                              if dataset_names is None else dataset_names)
        self.shuffle = shuffle

    @property
    def num_of_pages(self):
        """Number of pages for splitting.

        Once num_of_pages is set, ``shuffled_pages`` and ``ds_to_pages``
        become available. Reading it before it has been set raises
        AttributeError.
        """
        return self.__num_of_pages

    @num_of_pages.setter
    def num_of_pages(self, numberofpages):
        self.__num_of_pages = numberofpages
        # Order matters: pages_to_datasets() reads shuffled_pages.
        self.shuffled_pages = self.__num_of_pages
        self.ds_to_pages = self.pages_to_datasets()
        self.logger.debug(f"ds_to_pages: {self.ds_to_pages}")

    @property
    def shuffled_pages(self):
        """List of shuffled page numbers if shuffle is true, else the
        ordered page numbers ``0 .. num_of_pages - 1``.
        """
        return self.__shuffled_pages

    @shuffled_pages.setter
    def shuffled_pages(self, num_of_pages):
        # The setter receives a page COUNT and stores the resulting list.
        if self.shuffle:
            # NOTE(review): presumably returns a random permutation of
            # range(num_of_pages) -- confirm against Helpers.
            self.__shuffled_pages = Helpers.generate_random_shuffle(num_of_pages)
        else:
            self.__shuffled_pages = list(range(0, num_of_pages))

    @staticmethod
    def match_splitratios_and_datasetnames(split_ratios=None, dataset_names=None):
        """Make the ratio list and the dataset-name list the same length.

        If the two lists differ in length, the shorter one is padded.
        A missing dataset name becomes ``'set_{position of missing item}'``.
        A missing ratio becomes 0, so no pages are created for that dataset.
        If both lists are empty, a single ``'train'`` dataset with ratio 1
        is used.

        The inputs are copied first, so neither the caller's lists nor any
        shared default is modified.

        Args:
            split_ratios: list of ratios for pages
            dataset_names: list of names for the datasets

        Returns:
            Tuple of (normalized ratios, dataset names).
        """
        split_ratios = [] if split_ratios is None else list(split_ratios)
        dataset_names = [] if dataset_names is None else list(dataset_names)

        if not split_ratios and not dataset_names:
            split_ratios = [1]
            dataset_names = ['train']

        # Called for its side effect: presumably pads the shorter list in
        # place with the filler value 0 -- confirm against Helpers.
        Helpers.extend_shorter_list(split_ratios, dataset_names, 0)
        # A 0 among the names is padding; replace it with a positional name.
        dataset_names = ["set_" + str(i) if ds == 0 else ds
                         for i, ds in enumerate(dataset_names)]
        return Helpers.normalize_ratios(split_ratios), dataset_names

    def pages_to_datasets(self):
        """Create a dict of dataset names and the page numbers in each.

        Each dataset is given ``ceil(num_of_pages * ratio)`` pages, taken in
        order from ``shuffled_pages``. Because of the rounding up, earlier
        datasets may use up the pages; later ones then get fewer pages or an
        empty list. A page is never assigned to more than one dataset.

        As a side effect, ``split_ratios`` and ``dataset_names`` on the
        instance are replaced by their length-matched, normalized versions.

        Example:
            ::

                This returns something like
                {
                    "train": [0, 1, 4, 8, 9, 3, 6]
                    "val" : [2, 5]
                    "test": [7]
                }

            In the above example, the train set contains the pages in its
            list, and so on for val and test.

        Returns:
            dict mapping dataset name to a list of page numbers.
        """
        self.split_ratios, self.dataset_names = \
            Splitter.match_splitratios_and_datasetnames(self.split_ratios,
                                                        self.dataset_names)
        num_of_pages = len(self.shuffled_pages)
        pages_per_ds = [math.ceil(num_of_pages * r) for r in self.split_ratios]
        self.logger.debug(f"pages_per_ds: {pages_per_ds}")

        start = 0
        pages_to_ds = {}
        for p, d in zip(pages_per_ds, self.dataset_names):
            # Never step backwards (negative quota) or past the last page.
            end = min(start + max(p, 0), num_of_pages)
            pages_to_ds[d] = self.shuffled_pages[start:end]
            # The next dataset starts exactly where this one stopped.
            # Clamping to num_of_pages - 1 instead would hand the last page
            # to every remaining dataset once the pages run out.
            start = end
        return pages_to_ds