Source code for nlp_data_py.dataset.command_line

import argparse
from nlp_data_py.commons.bookdef import Book
from nlp_data_py.commons.splitter import Splitter
from nlp_data_py.dataset.wiki import WikiDataset
from nlp_data_py.dataset.constants import *

def wiki_dataset():
    """Command-line entry point: parse options and create a wiki dataset.

    Registers the ``nlp_data_py`` command-line options, parses them from
    the process arguments, prints the parsed values, passes them on to
    ``WikiDataset.create_dataset_from_wiki`` and finally prints the
    ``scanned`` attribute of the object that call returns.
    """
    # One (flags, add_argument keyword arguments) pair per option, listed
    # in the order in which the options are registered on the parser.
    option_specs = (
        (('-s', '--seed'),
         {'nargs': '*', 'default': [], 'help': seed_help}),
        (('-m', '--match'),
         {'type': str, 'default': "", 'help': match_help}),
        (('-r', '--recursive'),
         {'type': str2bool, 'default': True, 'help': recursive_help}),
        (('-l', '--limit'),
         {'type': int, 'default': 20, 'help': limit_help}),
        (('-p', '--pickle'),
         {'type': str, 'default': "./vars/scanned.pkl",
          'help': pickle_help}),
        (('-o', '--output'),
         {'type': str, 'default': "./vars/", 'help': output_help}),
        (('-cs', '--chunk_splitter'),
         {'type': str, 'default': '(?<=[.!?]) +',
          'help': chunk_splitter_help}),
        (('-cp', '--chunks_per_page'),
         {'type': int, 'default': 5, 'help': chunks_per_page_help}),
        (('-sr', '--split_ratio'),
         {'nargs': "*", 'type': float, 'default': [0.8, 0.1, 0.1],
          'help': split_ratio_help}),
        (('-ds', '--datasets'),
         {'nargs': "*", 'default': ["train", "val", "test"],
          'help': datasets_help}),
        (('-sf', '--shuffle'),
         {'type': str2bool, 'default': True, 'help': shuffle_help}),
    )

    cli = argparse.ArgumentParser(prog='nlp_data_py')
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    args = cli.parse_args()

    # Collect the parsed values into a plain dict, keeping a fixed key
    # order so the printed summary below is stable.
    option_names = (
        'seed', 'match', 'recursive', 'limit', 'pickle', 'output',
        'chunk_splitter', 'chunks_per_page', 'split_ratio', 'datasets',
        'shuffle',
    )
    ins = {name: getattr(args, name) for name in option_names}
    print(f"Ins: {ins}")

    book_def = Book(ins['chunk_splitter'], ins['chunks_per_page'])
    splitter = Splitter(ins['split_ratio'], ins['datasets'], ins['shuffle'])
    wiki = WikiDataset.create_dataset_from_wiki(
        seeds=ins['seed'],
        match=ins['match'],
        recursive=ins['recursive'],
        limit=ins['limit'],
        scanned_pickle=ins['pickle'],
        save_dataset_path=ins['output'],
        book_def=book_def,
        splitter=splitter,
    )
    print(wiki.scanned)
def str2bool(v):
    """Convert a textual yes/no flag into a ``bool``.

    Intended for use as an ``argparse`` ``type=`` callable.

    Args:
        v: The value to convert. A ``bool`` is returned unchanged;
            a string is compared case-insensitively against the
            accepted spellings listed below.

    Returns:
        ``True`` for ``yes``, ``true``, ``t``, ``y`` or ``1``;
        ``False`` for ``no``, ``false``, ``f``, ``n`` or ``0``.

    Raises:
        argparse.ArgumentTypeError: If the string is not one of the
            accepted spellings.
    """
    # A value that is already a bool (e.g. when the function is called
    # directly rather than through argparse) has no ``.lower()``; pass it
    # through instead of failing with AttributeError.
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')