{ "data": [ { "title": "Terremoto del Sichuan del 2008", "paragraphs": [ { "context": "Il terremoto del Sichuan del 2008 o il terremoto del Gran Sichuan, misurato a 8.0 Ms e 7.9 Mw, e si è verificato alle 02:28:01 PM China Standard Time all' epicentro (06:28:01 UTC) il 12 maggio nella provincia del Sichuan, ha ucciso 69.197 persone e lasciato 18.222 dispersi.", "qas": [ { "id": "56cdca7862d2951400fa6826", "answers": [ { "text": "2008", "answer_start": 29 } ], "question": "In quale anno si è verificato il terremoto nel Sichuan?" }, ...
>>> squad_it_dataset["train"][0] { "title": "Terremoto del Sichuan del 2008", "paragraphs": [ { "context": "Il terremoto del Sichuan del 2008 o il terremoto...", "qas": [ { "answers": [{"answer_start": 29, "text": "2008"}], "id": "56cdca7862d2951400fa6826", "question": "In quale anno si è verificato il terremoto nel Sichuan?", }, ...
>>> raw_datasets = load_dataset("glue", "mrpc") Found cached dataset glue (/Users/harry/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad) 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 375.89it/s] >>> sentence_sample = raw_datasets['train'].shuffle(seed=42).select(range(100)) >>> sentence_sample[:3] {'sentence1': ['" The public is understandably losing patience with these unwanted phone calls , unwanted intrusions , " he said at a White House ceremony .', 'Federal agent Bill Polychronopoulos said it was not known if the man , 30 , would be charged .', 'The companies uniformly declined to give specific numbers on customer turnover , saying they will release those figures only when they report overall company performance at year-end .'], 'sentence2': ['" While many good people work in the telemarketing industry , the public is understandably losing patience with these unwanted phone calls , unwanted intrusions , " Mr. 
Bush said .', 'Federal Agent Bill Polychronopoulos said last night the man involved in the Melbourne incident had been unarmed .', 'The companies , however , declined to give specifics on customer turnover , saying they would release figures only when they report their overall company performance .'], 'label': [0, 0, 1], 'idx': [3946, 3683, 3919]} >>> sentence_sample = raw_datasets['train'] >>> print(sentence_sample) Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 3668 }) >>> sentence_sample = sentence_sample.filter(lambda x: x["sentence1"][0]=="\"") >>> print(sentence_sample) Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 343 }) >>> sentence_sample[:3] {'sentence1': ['" I think you \'ll see a lot of job growth in the next two years , " he said , adding the growth could replace jobs lost .', '" The result is an overall package that will provide significant economic growth for our employees over the next four years . "', '" We are declaring war on sexual harassment and sexual assault .'], 'sentence2': ['" I think you \'ll see a lot of job growth in the next two years , " said Mankiw .', '" The result is an overall package that will provide a significant economic growth for our employees over the next few years , " he said .', '" We have declared war on sexual assault and sexual harassment , " Rosa said .'], 'label': [0, 1, 1], 'idx': [20, 49, 89]} >>>
>>> sentence_sample = raw_datasets['train'] >>> def upper_sentence(example): ... example["sentence1"] = example["sentence1"].upper() ... return example ... >>> sentence_sample = sentence_sample.map(upper_sentence) >>> sentence_sample[0] {'sentence1': 'AMROZI ACCUSED HIS BROTHER , WHOM HE CALLED " THE WITNESS " , OF DELIBERATELY DISTORTING HIS EVIDENCE .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0} >>> raw_datasets['train']['sentence1'][0] 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .' >>> sentence_sample['sentence1'][0] 'AMROZI ACCUSED HIS BROTHER , WHOM HE CALLED " THE WITNESS " , OF DELIBERATELY DISTORTING HIS EVIDENCE .'
BuilderConfig(name="first_domain", version=VERSION, description="This part of my dataset covers a first domain"), BuilderConfig(name="second_domain", version=VERSION, description="This part of my dataset covers a second domain"),
class SuperGlueConfig(datasets.BuilderConfig):
    """BuilderConfig for SuperGLUE."""

    def __init__(self, features, data_url, citation, url, label_classes=("False", "True"), **kwargs):
        """BuilderConfig for SuperGLUE.

        Args:
            features: *list[string]*, list of the features that will appear in the
                feature dict. Should not include "label".
            data_url: *string*, url to download the zip file from.
            citation: *string*, citation for the data set.
            url: *string*, url for information about the data set.
            label_classes: *list[string]*, the list of classes for the label if the
                label is present as a string. Non-string labels will be cast to
                either 'False' or 'True'.
            **kwargs: keyword arguments forwarded to super.
        """
        # Version history:
        # 1.0.2: Fixed non-nondeterminism in ReCoRD.
        # 1.0.1: Change from the pre-release trial version of SuperGLUE (v1.9) to
        #        the full release (v2.0).
        # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
        # 0.0.2: Initial version.
        super().__init__(version=datasets.Version("1.0.2"), **kwargs)
        self.features = features
        self.label_classes = label_classes
        self.data_url = data_url
        self.citation = citation
        self.url = url
# Fragment of an example-generation loop (enclosing function not shown here):
# parse one annotation JSON, load its matching image, and accumulate
# word-level tokens, BIO-style NER tags, and normalized bounding boxes into
# the surrounding `tokens` / `ner_tags` / `bboxes` lists.
file_path = os.path.join(ann_dir, file)
with open(file_path, "r", encoding="utf8") as f:
    data = json.load(f)
image_path = os.path.join(img_dir, file)
# Annotation and image presumably share a basename; swap the extension.
image_path = image_path.replace("json", "png")
image, size = load_image(image_path)
for item in data["form"]:
    words, label = item["words"], item["label"]
    # Drop words whose text is empty or pure whitespace.
    words = [w for w in words if w["text"].strip() != ""]
    if len(words) == 0:
        continue
    if label == "other":
        # "other" spans are outside any entity: tag every word "O".
        for w in words:
            tokens.append(w["text"])
            ner_tags.append("O")
            bboxes.append(normalize_bbox(w["box"], size))
    else:
        # First word of an entity gets the B- prefix, the rest get I-.
        tokens.append(words[0]["text"])
        ner_tags.append("B-" + label.upper())
        bboxes.append(normalize_bbox(words[0]["box"], size))
        for w in words[1:]:
            tokens.append(w["text"])
            ner_tags.append("I-" + label.upper())
            bboxes.append(normalize_bbox(w["box"], size))