init

2019-02-01 01:27:38 +11:00
commit 13e908f4df
104 changed files with 102494 additions and 0 deletions
--- a/lib/datasets/LanguageDataset.py
+++ b/lib/datasets/LanguageDataset.py
@@ -0,0 +1,122 @@
+import os
+import torch
+
+from collections import Counter
+
+
+class Dictionary(object):
+  """Vocabulary that maps words to consecutive integer ids and back.
+
+  Attributes (all maintained by `add_word`):
+    word2idx: dict from word to its id.
+    idx2word: list where position `i` holds the word with id `i`.
+    counter:  Counter keyed by id, counting how often each word was added.
+    total:    number of `add_word` calls made so far.
+  """
+
+  def __init__(self):
+    self.word2idx, self.idx2word = {}, []
+    self.counter = Counter()
+    self.total = 0
+
+  def add_word(self, word):
+    """Record one occurrence of `word` and return its id.
+
+    An unseen word receives the next free id, i.e. the current vocabulary size.
+    """
+    token_id = self.word2idx.get(word)
+    if token_id is None:
+      # New word: its id is the position it is about to occupy in idx2word.
+      token_id = len(self.idx2word)
+      self.idx2word.append(word)
+      self.word2idx[word] = token_id
+    self.counter[token_id] += 1
+    self.total += 1
+    return token_id
+
+  def __len__(self):
+    """Return the number of distinct words seen."""
+    return len(self.idx2word)
+
+
+class Corpus(object):
+  """Token-level corpus: every split is one flat tensor of word ids.
+
+  Reads `train.txt`, `valid.txt` and `test.txt` from `path`, in that order,
+  sharing a single `Dictionary` across the three splits.
+  """
+
+  def __init__(self, path):
+    self.dictionary = Dictionary()
+    self.train = self.tokenize(os.path.join(path, 'train.txt'))
+    self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+    self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+  def tokenize(self, path):
+    """Tokenizes a text file.
+
+    Each line is split on whitespace and terminated with an '<eos>' token.
+    Returns a 1-D `torch.LongTensor` holding the ids of all tokens in file
+    order; asserts that `path` exists.
+    """
+    assert os.path.exists(path)
+
+    # Pass 1: register every token in the dictionary and count them, so the
+    # output tensor can be allocated at its final size.
+    num_tokens = 0
+    with open(path, 'r', encoding='utf-8') as handle:
+      for raw_line in handle:
+        pieces = raw_line.split() + ['<eos>']
+        num_tokens += len(pieces)
+        for piece in pieces:
+          self.dictionary.add_word(piece)
+
+    # Pass 2: write the id of each token into the preallocated tensor.
+    ids = torch.LongTensor(num_tokens)
+    cursor = 0
+    with open(path, 'r', encoding='utf-8') as handle:
+      for raw_line in handle:
+        for piece in raw_line.split() + ['<eos>']:
+          ids[cursor] = self.dictionary.word2idx[piece]
+          cursor += 1
+
+    return ids
+
+class SentCorpus(object):
+  """Sentence-level corpus: every split is a list of per-line id tensors.
+
+  Reads `train.txt`, `valid.txt` and `test.txt` from `path`, in that order,
+  sharing a single `Dictionary` across the three splits.
+  """
+
+  def __init__(self, path):
+    self.dictionary = Dictionary()
+    self.train = self.tokenize(os.path.join(path, 'train.txt'))
+    self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
+    self.test = self.tokenize(os.path.join(path, 'test.txt'))
+
+  def tokenize(self, path):
+    """Tokenizes a text file into one id tensor per non-blank line.
+
+    Args:
+      path: UTF-8 text file whose tokens are separated by whitespace.
+
+    Returns:
+      A list of 1-D `torch.LongTensor`s, one per non-blank line in file
+      order, each terminated by the id of '<eos>'.
+
+    Raises:
+      AssertionError: if `path` does not exist.
+    """
+    assert os.path.exists(path), 'missing corpus file : {:}'.format(path)
+    sents = []
+    # One pass is enough: a word's id is fixed the moment it is first added.
+    with open(path, 'r', encoding='utf-8') as f:
+      for line in f:
+        words = line.split() + ['<eos>']
+        # Every line, blank ones included, still updates the dictionary, so
+        # word ids and counts stay identical to what was produced before.
+        ids = [self.dictionary.add_word(word) for word in words]
+        # Lines read from a file keep their '\n', so `not line` can never
+        # detect a blank line; test for an '<eos>'-only sentence instead.
+        if len(words) == 1:
+          continue
+        sents.append(torch.LongTensor(ids))
+
+    return sents
+
+class BatchSentLoader(object):
+  """Iterates over sentences in padded mini-batches of similar length.
+
+  Sentences are sorted by length once, at construction time, and then served
+  in consecutive chunks of `batch_size`. Each batch is a LongTensor of shape
+  (longest sentence in the batch, number of sentences in the batch): one
+  sentence per column, padded at the bottom with `pad_id`.
+
+  The loader is its own iterator, so only one iteration can be in flight at a
+  time; calling `iter()` rewinds it to the first batch.
+
+  Args:
+    sents: sequence of 1-D tensors (anything exposing `size(0)`).
+    batch_size: maximum number of sentences per batch.
+    pad_id: value used to fill positions past the end of a sentence.
+    cuda: if true, every batch is moved with `.cuda()` before being returned.
+    volatile: stored on the instance but not read anywhere in this class.
+  """
+
+  def __init__(self, sents, batch_size, pad_id=0, cuda=False, volatile=False):
+    self.sents = sents
+    self.batch_size = batch_size
+    self.sort_sents = sorted(sents, key=lambda x: x.size(0))
+    self.cuda = cuda
+    self.volatile = volatile
+    self.pad_id = pad_id
+    # Cursor into sort_sents. Set here, not only in __iter__, so that calling
+    # next() on a fresh loader works instead of raising AttributeError.
+    self.idx = 0
+
+  def __next__(self):
+    """Return the next padded batch, or raise StopIteration when exhausted."""
+    if self.idx >= len(self.sort_sents):
+      raise StopIteration
+
+    # The final batch may hold fewer than batch_size sentences.
+    batch_size = min(self.batch_size, len(self.sort_sents)-self.idx)
+    batch = self.sort_sents[self.idx:self.idx+batch_size]
+    max_len = max(s.size(0) for s in batch)
+    tensor = torch.LongTensor(max_len, batch_size).fill_(self.pad_id)
+    for col, sent in enumerate(batch):
+      # Copy the sentence into the top of its column; the rest stays pad_id.
+      tensor[:sent.size(0), col].copy_(sent)
+    if self.cuda:
+      tensor = tensor.cuda()
+
+    self.idx += batch_size
+
+    return tensor
+
+  # Python 2 spelling of the iterator protocol.
+  next = __next__
+
+  def __iter__(self):
+    """Rewind to the first batch and return the loader itself."""
+    self.idx = 0
+    return self
--- a/lib/datasets/MetaBatchSampler.py
+++ b/lib/datasets/MetaBatchSampler.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+import numpy as np
+import torch
+
+
+class MetaBatchSampler(object):
+  """Episodic batch sampler.
+
+  Each iteration yields a shuffled LongTensor of
+  `classes_per_it * num_samples` dataset indices: `classes_per_it` classes are
+  drawn at random and `num_samples` distinct samples are drawn from each.
+  """
+
+  def __init__(self, labels, classes_per_it, num_samples, iterations):
+    '''
+    Initialize MetaBatchSampler
+    Args:
+    - labels: an iterable containing all the labels for the current dataset;
+    sample indexes are inferred from this iterable. It must provide `.copy()`
+    and its elements must provide `.item()` (e.g. a numpy integer array).
+    The distinct labels must be exactly 0 .. C-1.
+    - classes_per_it: number of random classes for each iteration (at most C)
+    - num_samples: number of samples for each iteration for each class (support + query)
+    - iterations: number of iterations (episodes) per epoch
+    '''
+    super(MetaBatchSampler, self).__init__()
+    self.labels           = labels.copy()
+    self.classes_per_it   = classes_per_it
+    self.sample_per_class = num_samples
+    self.iterations       = iterations
+
+    self.classes, self.counts = np.unique(self.labels, return_counts=True)
+    assert len(self.classes) == np.max(self.classes) + 1 and np.min(self.classes) == 0
+    # Drawing every class is valid, so equality is allowed.
+    assert classes_per_it <= len(self.classes), '{:} vs. {:}'.format(classes_per_it, len(self.classes))
+    self.classes = torch.LongTensor(self.classes)
+
+    # self.indexes maps each class id to a LongTensor holding the dataset
+    # indices of the samples that belong to that class.
+    grouped = { x.item() : [] for x in self.classes }
+    for idx, label in enumerate(self.labels):
+      grouped[ label.item() ].append( idx )
+    self.indexes = { key : torch.LongTensor( value ) for key, value in grouped.items() }
+
+
+  def __iter__(self):
+    # yield a batch of indexes
+    spc = self.sample_per_class
+    cpi = self.classes_per_it
+    batch_size = spc * cpi
+    # Loop-invariant sanity check, done once per epoch.
+    assert cpi <= len(self.classes), '{:} vs. {:}'.format(cpi, len(self.classes))
+
+    for it in range(self.iterations):
+      batch = torch.LongTensor(batch_size)
+      # Pick cpi distinct classes for this episode.
+      c_idxs = torch.randperm(len(self.classes))[:cpi]
+
+      for i, cls in enumerate(self.classes[c_idxs]):
+        s = slice(i * spc, (i + 1) * spc)
+        candidates = self.indexes[ cls.item() ]
+        num = candidates.nelement()
+        # Taking all samples of a class is valid; only spc > num cannot be
+        # satisfied without replacement.
+        assert spc <= num, '{:} vs. {:}'.format(spc, num)
+        sample_idxs = torch.randperm( num )[:spc]
+        batch[s] = candidates[sample_idxs]
+
+      # Shuffle so that samples of the same class are not contiguous.
+      batch = batch[torch.randperm(len(batch))]
+      yield batch
+
+  def __len__(self):
+    # returns the number of iterations (episodes) per epoch
+    return self.iterations
--- a/lib/datasets/TieredImageNet.py
+++ b/lib/datasets/TieredImageNet.py
@@ -0,0 +1,84 @@
+from __future__ import print_function
+import numpy as np
+from PIL import Image
+import pickle as pkl
+import os, cv2, csv, glob
+import torch
+import torch.utils.data as data
+
+
+class TieredImageNet(data.Dataset):
+  """Dataset over the tiered-ImageNet files stored under `root_dir`.
+
+  `split` names one or more sub-splits joined by '-' (e.g. 'val-test'). For
+  each sub-split `<name>`, `<name>_labels.pkl` and `<name>_images.npz` are
+  read from `root_dir`; a missing npz is first rebuilt from
+  `<name>_images_png.pkl` via `decompress`. When several sub-splits are
+  given, the labels of each later one are shifted past the largest label seen
+  so far, so classes from different sub-splits never collide.
+
+  Args:
+    root_dir: directory holding the label / image files.
+    split: sub-split name, or several names joined by '-'.
+    transform: optional callable applied to the PIL image in `__getitem__`.
+
+  Raises:
+    ValueError: if an npz file is missing and there is no `*_png.pkl` to
+      rebuild it from.
+  """
+
+  def __init__(self, root_dir, split, transform=None):
+    self.split = split
+    self.root_dir = root_dir
+    self.transform = transform
+
+    images, labels, last = [], [], 0
+    for sub_split in split.split('-'):
+      labels_name = '{:}/{:}_labels.pkl'.format(self.root_dir, sub_split)
+      images_name = '{:}/{:}_images.npz'.format(self.root_dir, sub_split)
+      # decompress images if the npz does not exist
+      if not os.path.exists(images_name):
+        png_pkl = images_name[:-4] + '_png.pkl'
+        if os.path.exists(png_pkl):
+          decompress(images_name, png_pkl)
+        else:
+          raise ValueError('png_pkl {:} not exits'.format( png_pkl ))
+      assert os.path.exists(images_name) and os.path.exists(labels_name), '{:} & {:}'.format(images_name, labels_name)
+      print ("Prepare {:} done".format(images_name))
+      label_specific = self._load_label_specific(labels_name)
+      with np.load(images_name, mmap_mode="r", encoding='latin1') as archive:
+        image_data = archive["images"]
+      images.append( image_data )
+      # Shift this sub-split's labels past those of the previous sub-splits.
+      label_specific = label_specific + last
+      labels.append( label_specific )
+      last = np.max(label_specific) + 1
+      print ("Load {:} done, with image shape = {:}, label shape = {:}, [{:} ~ {:}]".format(images_name, image_data.shape, label_specific.shape, np.min(label_specific), np.max(label_specific)))
+    images, labels = np.concatenate(images), np.concatenate(labels)
+
+    self.images = images
+    self.labels = labels
+    self.n_classes = int( np.max(labels) + 1 )
+    # class id -> array of the dataset indices carrying that label
+    self.dict_index_label = {}
+    for cls in range(self.n_classes):
+      idxs = np.where(labels==cls)[0]
+      self.dict_index_label[cls] = idxs
+    self.length = len(labels)
+    print ("There are {:} images, {:} labels [{:} ~ {:}]".format(images.shape, labels.shape, np.min(labels), np.max(labels)))
+
+
+  @staticmethod
+  def _load_label_specific(labels_name):
+    """Return the 'label_specific' entry of the pickled label file."""
+    try:
+      # Python 2 style: text-mode pickle with native-str keys.
+      with open(labels_name) as f:
+        content = pkl.load(f)
+        return content["label_specific"]
+    except Exception:
+      # Python 3: pickles must be read in binary mode. With encoding='bytes'
+      # a pickle written by Python 2 exposes bytes keys, while one written by
+      # Python 3 keeps its str keys, so accept both spellings.
+      with open(labels_name, 'rb') as f:
+        content = pkl.load(f, encoding='bytes')
+      if b'label_specific' in content:
+        return content[b'label_specific']
+      return content['label_specific']
+
+
+  def __repr__(self):
+    return ('{name}(length={length}, classes={n_classes})'.format(name=self.__class__.__name__, **self.__dict__))
+
+  def __len__(self):
+    return self.length
+
+  def __getitem__(self, index):
+    """Return `(image, label)` for `index`; the image is a PIL RGB image, or
+    whatever `transform` makes of it."""
+    assert index >= 0 and index < self.length, 'invalid index = {:}'.format(index)
+    image = self.images[index].copy()
+    label = int(self.labels[index])
+    # Reverse the channel axis before building the RGB image (presumably the
+    # arrays are stored in OpenCV's BGR order, as written by `decompress`).
+    image = Image.fromarray(image[:,:,::-1].astype('uint8'), 'RGB')
+    if self.transform is not None:
+      image = self.transform( image )
+    return image, label
+
+
+
+
+def decompress(path, output):
+  """Decode the pickled image buffers in `output` and save them to `path`.
+
+  `output` is a pickle holding a sequence of encoded image buffers. Each one
+  is decoded with `cv2.imdecode(..., 1)` into a slot of a uint8 array of
+  shape (N, 84, 84, 3), which is written with `np.savez` under the key
+  'images'.
+  """
+  with open(output, 'rb') as handle:
+    encoded = pkl.load(handle, encoding='bytes')
+  decoded = np.zeros([len(encoded), 84, 84, 3], dtype=np.uint8)
+  for position, buf in enumerate(encoded):
+    decoded[position] = cv2.imdecode(buf, 1)
+  np.savez(path, images=decoded)
--- a/lib/datasets/init.py
+++ b/lib/datasets/init.py
@@ -0,0 +1,3 @@
+from .MetaBatchSampler import MetaBatchSampler
+from .TieredImageNet import TieredImageNet
+from .LanguageDataset import Corpus
--- a/lib/datasets/test_NLP.py
+++ b/lib/datasets/test_NLP.py
@@ -0,0 +1,10 @@
+import os, sys, torch
+
+from LanguageDataset import SentCorpus, BatchSentLoader
+
+if __name__ == '__main__':
+  # Smoke test: load the Penn corpus sentence by sentence and print the shape
+  # of every padded batch drawn from its test split.
+  corpus = SentCorpus( '../../data/data/penn' )
+  batches = BatchSentLoader(corpus.test, 10)
+  for index, batch in enumerate(batches):
+    print('{:} :: {:}'.format(index, batch.size()))
--- a/lib/datasets/test_dataset.py
+++ b/lib/datasets/test_dataset.py
@@ -0,0 +1,33 @@
+import os, sys, torch
+import torchvision.transforms as transforms
+
+from TieredImageNet import TieredImageNet
+from MetaBatchSampler import MetaBatchSampler
+
+# Smoke test for TieredImageNet + MetaBatchSampler; expects the data under
+# $TORCH_HOME/tiered-imagenet.
+root_dir = os.environ['TORCH_HOME'] + '/tiered-imagenet'
+print ('root : {:}'.format(root_dir))
+means, stds = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
+
+# Augmentation pipeline applied to every image.
+lists = [
+  transforms.RandomHorizontalFlip(),
+  transforms.RandomCrop(84, padding=8),
+  transforms.ToTensor(),
+  transforms.Normalize(means, stds),
+]
+transform = transforms.Compose(lists)
+
+# Load the validation and test splits together and inspect one sample.
+dataset = TieredImageNet(root_dir, 'val-test', transform)
+image, label = dataset[111]
+print ('image shape = {:}, label = {:}'.format(image.size(), label))
+print ('image : min = {:}, max = {:}    ||| label : {:}'.format(image.min(), image.max(), label))
+
+
+# 10 episodes, each with 250 classes and 100 samples per class.
+sampler = MetaBatchSampler(dataset.labels, 250, 100, 10)
+
+dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler)
+
+print ('the length of dataset : {:}'.format( len(dataset) ))
+print ('the length of loader  : {:}'.format( len(dataloader) ))
+
+for images, labels in dataloader:
+  print ('images : {:}'.format( images.size() ))
+  print ('labels : {:}'.format( labels.size() ))
+  # Per-channel statistics of the batch.
+  for i in range(3):
+    channel = images[:,i]
+    print ('image-value-[{:}] : {:} ~ {:}, mean={:}, std={:}'.format(i, channel.min(), channel.max(), channel.mean(), channel.std()))
+
+print('-----')