From 7b3d15da18a24fdae650bf222bb9340e3fd39492 Mon Sep 17 00:00:00 2001 From: Dmitri Soshnikov <dmitri@soshnikov.com> Date: Wed, 25 May 2022 00:40:51 +0300 Subject: [PATCH] Fix torchtext issue --- .devcontainer/devcontainer.json | 2 + .devcontainer/requirements.txt | 5 +- README.md | 1 + binder/requirements.txt | 5 +- lessons/5-NLP/14-Embeddings/torchnlp.py | 4 +- lessons/5-NLP/16-RNN/torchnlp.py | 4 +- .../GenerativePyTorch.ipynb | 103 ++++++++--------- .../17-GenerativeNetworks/GenerativeTF.ipynb | 9 +- .../5-NLP/17-GenerativeNetworks/torchnlp.py | 104 ++++++++++++++++++ 9 files changed, 166 insertions(+), 71 deletions(-) create mode 100644 lessons/5-NLP/17-GenerativeNetworks/torchnlp.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2f8841c..5c9a324 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,6 +8,8 @@ } }, + "hostRequirements": { "cpus": 4, "memory": "8gb", "storage": "100gb"}, + // Configure tool-specific properties. "customizations": { // Configure properties specific to VS Code. diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 36f009a..935d6db 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -21,7 +21,8 @@ tokenizers==0.10.3 torch==1.11.0 torchaudio torchinfo -torchtext -torchvision +torchtext==0.12.0 +torchvision==0.12.0 +torchdata tqdm==4.62.3 transformers==4.3.3 diff --git a/README.md b/README.md index e54db33..ced7a3b 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [](https://GitHub.com/microsoft/AI-For-Beginners/watchers/) [](https://GitHub.com/microsoft/AI-For-Beginners/network/) [](https://GitHub.com/microsoft/AI-For-Beginners/stargazers/) +[](https://mybinder.org/v2/gh/microsoft/ai-for-beginners/HEAD) # Artificial Intelligence for Beginners - A Curriculum diff --git a/binder/requirements.txt b/binder/requirements.txt index 36f009a..935d6db 100644 --- a/binder/requirements.txt +++ b/binder/requirements.txt @@ -21,7 +21,8 @@ tokenizers==0.10.3 torch==1.11.0 torchaudio torchinfo -torchtext -torchvision +torchtext==0.12.0 +torchvision==0.12.0 +torchdata tqdm==4.62.3 transformers==4.3.3 diff --git a/lessons/5-NLP/14-Embeddings/torchnlp.py b/lessons/5-NLP/14-Embeddings/torchnlp.py index d6ca5e0..cd709f0 100644 --- a/lessons/5-NLP/14-Embeddings/torchnlp.py +++ b/lessons/5-NLP/14-Embeddings/torchnlp.py @@ -20,12 +20,12 @@ def load_dataset(ngrams=1,min_freq=1): counter = collections.Counter() for (label, line) in train_dataset: counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) - vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) return train_dataset,test_dataset,classes,vocab def encode(x,voc=None,unk=0,tokenizer=tokenizer): v = vocab if voc is None else voc - return [v.stoi.get(s,unk) for s in tokenizer(x)] + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) diff --git a/lessons/5-NLP/16-RNN/torchnlp.py b/lessons/5-NLP/16-RNN/torchnlp.py index d6ca5e0..cd709f0 100644 --- a/lessons/5-NLP/16-RNN/torchnlp.py +++ b/lessons/5-NLP/16-RNN/torchnlp.py @@ -20,12 +20,12 @@ def load_dataset(ngrams=1,min_freq=1): counter = collections.Counter() for (label, line) in train_dataset: 
counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) - vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) return train_dataset,test_dataset,classes,vocab def encode(x,voc=None,unk=0,tokenizer=tokenizer): v = vocab if voc is None else voc - return [v.stoi.get(s,unk) for s in tokenizer(x)] + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb index 04a069d..8420b35 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativePyTorch.ipynb @@ -53,9 +53,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Vocabulary size = 84\n", - "Encoding of 'a' is 4\n", - "Character with code 13 is h\n" + "Vocabulary size = 82\n", + "Encoding of 'a' is 1\n", + "Character with code 13 is c\n" ] } ], @@ -66,12 +66,12 @@ "counter = collections.Counter()\n", "for (label, line) in train_dataset:\n", " counter.update(char_tokenizer(line))\n", - "vocab = torchtext.vocab.Vocab(counter)\n", + "vocab = torchtext.vocab.vocab(counter)\n", "\n", "vocab_size = len(vocab)\n", "print(f\"Vocabulary size = {vocab_size}\")\n", - "print(f\"Encoding of 'a' is {vocab.stoi['a']}\")\n", - "print(f\"Character with code 13 is {vocab.itos[13]}\")" + "print(f\"Encoding of 'a' is {vocab.get_stoi()['a']}\")\n", + "print(f\"Character with code 13 is {vocab.get_itos()[13]}\")" ] }, { @@ -83,23 +83,23 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "tensor([43, 4, 11, 11, 2, 26, 5, 23, 2, 38, 3, 4, 10, 9, 2, 31, 11, 4,\n", - " 21, 2, 38, 4, 14, 25, 2, 34, 8, 5, 6, 2, 5, 13, 3, 2, 38, 11,\n", - " 4, 14, 25, 2, 55, 37, 3, 15, 5, 3, 10, 9, 56, 2, 37, 3, 15, 5,\n", - " 3, 10, 9, 2, 29, 2, 26, 13, 6, 10, 5, 29, 9, 3, 11, 11, 3, 10,\n", - " 9, 27, 2, 43, 4, 11, 11, 2, 26, 5, 10, 3, 3, 5, 58, 9, 2, 12,\n", - " 21, 7, 8, 12, 11, 7, 8, 18, 61, 22, 4, 8, 12, 2, 6, 19, 2, 15,\n", - " 11, 5, 10, 4, 29, 14, 20, 8, 7, 14, 9, 27, 2, 4, 10, 3, 2, 9,\n", - " 3, 3, 7, 8, 18, 2, 18, 10, 3, 3, 8, 2, 4, 18, 4, 7, 8, 23])" + "tensor([ 0, 1, 2, 2, 3, 4, 5, 6, 3, 7, 8, 1, 9, 10, 3, 11, 2, 1,\n", + " 12, 3, 7, 1, 13, 14, 3, 15, 16, 5, 17, 3, 5, 18, 8, 3, 7, 2,\n", + " 1, 13, 14, 3, 19, 20, 8, 21, 5, 8, 9, 10, 22, 3, 20, 8, 21, 5,\n", + " 8, 9, 10, 3, 23, 3, 4, 18, 17, 9, 5, 23, 10, 8, 2, 2, 8, 9,\n", + " 10, 24, 3, 0, 1, 2, 2, 3, 4, 5, 9, 8, 8, 5, 25, 10, 3, 26,\n", + " 12, 27, 16, 26, 2, 27, 16, 28, 29, 30, 1, 16, 26, 3, 17, 31, 3, 21,\n", + " 2, 5, 9, 1, 23, 13, 32, 16, 27, 13, 10, 24, 3, 1, 9, 8, 3, 10,\n", + " 8, 8, 27, 16, 28, 3, 28, 9, 8, 8, 16, 3, 1, 28, 1, 27, 16, 6])" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -128,29 +128,29 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(tensor([[43, 4, 11, ..., 18, 61, 22],\n", - " [ 4, 11, 11, ..., 61, 22, 4],\n", - " [11, 11, 2, ..., 22, 4, 8],\n", + "(tensor([[ 0, 1, 2, ..., 28, 29, 30],\n", + " [ 1, 2, 2, ..., 29, 30, 1],\n", + " [ 2, 2, 3, ..., 30, 1, 16],\n", " ...,\n", - " [37, 3, 
15, ..., 4, 18, 4],\n", - " [ 3, 15, 5, ..., 18, 4, 7],\n", - " [15, 5, 3, ..., 4, 7, 8]], device='cuda:0'),\n", - " tensor([[ 4, 11, 11, ..., 61, 22, 4],\n", - " [11, 11, 2, ..., 22, 4, 8],\n", - " [11, 2, 26, ..., 4, 8, 12],\n", + " [20, 8, 21, ..., 1, 28, 1],\n", + " [ 8, 21, 5, ..., 28, 1, 27],\n", + " [21, 5, 8, ..., 1, 27, 16]]),\n", + " tensor([[ 1, 2, 2, ..., 29, 30, 1],\n", + " [ 2, 2, 3, ..., 30, 1, 16],\n", + " [ 2, 3, 4, ..., 1, 16, 26],\n", " ...,\n", - " [ 3, 15, 5, ..., 18, 4, 7],\n", - " [15, 5, 3, ..., 4, 7, 8],\n", - " [ 5, 3, 10, ..., 7, 8, 23]], device='cuda:0'))" + " [ 8, 21, 5, ..., 28, 1, 27],\n", + " [21, 5, 8, ..., 1, 27, 16],\n", + " [ 5, 8, 9, ..., 27, 16, 6]]))" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +216,7 @@ " out, s = net(enc(chars).view(1,-1).to(device))\n", " for i in range(size):\n", " nc = torch.argmax(out[0][-1])\n", - " chars.append(vocab.itos[nc])\n", + " chars.append(vocab.get_itos()[nc])\n", " out, s = net(nc.view(1,-1),s)\n", " return ''.join(chars)" ] @@ -234,35 +234,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Current loss = 4.442246913909912\n", - "today ggrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg\n", - "Current loss = 2.1178359985351562\n", - "today and a could a the to the to the to the to the to the to the to the to the to the to the to the to th\n", - "Current loss = 1.6465336084365845\n", - "today on Tuesday the company to the United States and a policing to the United States and a policing to th\n", - "Current loss = 2.3716814517974854\n", - "today to the United States and a new men to the United States and a new men to the United States and a new\n", - "Current loss = 1.6844098567962646\n", - "today of the first the first the first the first the first the first the first the first the first the fir\n", - "Current loss = 1.702707052230835\n", - "today of the United States a said the United States a said the United States a said the United States a sa\n", - "Current loss = 1.9633255004882812\n", - "today of the first the first the first the first the first the first the first the first the first the fir\n", - "Current loss = 1.8642014265060425\n", - "today of the second a second a second a second a second a second a second a second a second a second a sec\n", - "Current loss = 1.7720613479614258\n", - "today and and and the company of the company of the company of the company of the company of the company o\n", - "Current loss = 1.52818763256073\n", - "today and the company of the company of the company of the company of the company of the company of the co\n", - "Current loss = 1.5444810390472412\n", - "today and the counters to the first the counters to the first the counters to the first the counters to th\n" + "Current loss = 4.398899078369141\n", + "today sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr sr s\n" ] } ], @@ -355,7 +335,7 @@ " #nc = torch.argmax(out[0][-1])\n", " out_dist = out[0][-1].div(temperature).exp()\n", " nc = torch.multinomial(out_dist,1)[0]\n", - " 
chars.append(vocab.itos[nc])\n", + " chars.append(vocab.get_itos()[nc])\n", " out, s = net(nc.view(1,-1),s)\n", " return ''.join(chars)\n", " \n", @@ -372,10 +352,13 @@ } ], "metadata": { + "interpreter": { + "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f" + }, "kernelspec": { - "display_name": "py37_pytorch", + "display_name": "Python 3.8.12 ('py38')", "language": "python", - "name": "conda-env-py37_pytorch-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -387,7 +370,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb index 2b2e95b..67ebf03 100644 --- a/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb +++ b/lessons/5-NLP/17-GenerativeNetworks/GenerativeTF.ipynb @@ -455,10 +455,13 @@ } ], "metadata": { + "interpreter": { + "hash": "16af2a8bbb083ea23e5e41c7f5787656b2ce26968575d8763f2c4b17f9cd711f" + }, "kernelspec": { - "display_name": "py38_tensorflow", + "display_name": "Python 3.8.12 ('py38')", "language": "python", - "name": "conda-env-py38_tensorflow-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -470,7 +473,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py new file mode 100644 index 0000000..cd709f0 --- /dev/null +++ b/lessons/5-NLP/17-GenerativeNetworks/torchnlp.py @@ -0,0 +1,104 @@ +import builtins +import torch +import torchtext +import collections +import os + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +vocab = None +tokenizer = torchtext.data.utils.get_tokenizer('basic_english') + +def load_dataset(ngrams=1,min_freq=1): + global vocab, tokenizer + print("Loading dataset...") + train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root='./data') + train_dataset = list(train_dataset) + test_dataset = list(test_dataset) + classes = ['World', 'Sports', 'Business', 'Sci/Tech'] + print('Building vocab...') + counter = collections.Counter() + for (label, line) in train_dataset: + counter.update(torchtext.data.utils.ngrams_iterator(tokenizer(line),ngrams=ngrams)) + vocab = torchtext.vocab.vocab(counter, min_freq=min_freq) + return train_dataset,test_dataset,classes,vocab + +def encode(x,voc=None,unk=0,tokenizer=tokenizer): + v = vocab if voc is None else voc + return [v.get_stoi().get(s,unk) for s in tokenizer(x)] + +def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200): + optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) + loss_fn = loss_fn.to(device) + net.train() + total_loss,acc,count,i = 0,0,0,0 + for labels,features in dataloader: + optimizer.zero_grad() + features, labels = features.to(device), labels.to(device) + out = net(features) + loss = loss_fn(out,labels) #cross_entropy(out,labels) + loss.backward() + optimizer.step() + total_loss+=loss + _,predicted = torch.max(out,1) + acc+=(predicted==labels).sum() + count+=len(labels) + i+=1 + if i%report_freq==0: + print(f"{count}: acc={acc.item()/count}") + if epoch_size and count>epoch_size: + break + return total_loss.item()/count, acc.item()/count + +def padify(b,voc=None,tokenizer=tokenizer): + # b is the list of 
tuples of length batch_size + # - first element of a tuple = label, + # - second = feature (text sequence) + # build vectorized sequence + v = [encode(x[1],voc=voc,tokenizer=tokenizer) for x in b] + # compute max length of a sequence in this minibatch + l = max(map(len,v)) + return ( # tuple of two tensors - labels and features + torch.LongTensor([t[0]-1 for t in b]), + torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v]) + ) + +def offsetify(b,voc=None): + # first, compute data tensor from all sequences + x = [torch.tensor(encode(t[1],voc=voc)) for t in b] + # now, compute the offsets by accumulating the tensor of sequence lengths + o = [0] + [len(t) for t in x] + o = torch.tensor(o[:-1]).cumsum(dim=0) + return ( + torch.LongTensor([t[0]-1 for t in b]), # labels + torch.cat(x), # text + o + ) + +def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,use_pack_sequence=False): + optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr) + loss_fn = loss_fn.to(device) + net.train() + total_loss,acc,count,i = 0,0,0,0 + for labels,text,off in dataloader: + optimizer.zero_grad() + labels,text = labels.to(device), text.to(device) + if use_pack_sequence: + off = off.to('cpu') + else: + off = off.to(device) + out = net(text, off) + loss = loss_fn(out,labels) #cross_entropy(out,labels) + loss.backward() + optimizer.step() + total_loss+=loss + _,predicted = torch.max(out,1) + acc+=(predicted==labels).sum() + count+=len(labels) + i+=1 + if i%report_freq==0: + print(f"{count}: acc={acc.item()/count}") + if epoch_size and count>epoch_size: + break + return total_loss.item()/count, acc.item()/count + -- GitLab
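
The substance of this patch is the torchtext 0.12 vocabulary API change that the NLP lessons are adapted to: torchtext.vocab.Vocab(counter) is replaced by the torchtext.vocab.vocab(counter) factory, attribute lookups via .stoi/.itos are replaced by the get_stoi()/get_itos() methods, and torchtext/torchvision are pinned to 0.12.0 with torchdata added so the AG_NEWS pipeline keeps working. A minimal before/after sketch, assuming torchtext==0.12.0 as pinned in the requirements (the sample sentence is illustrative only):

    import collections
    import torchtext

    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
    counter = collections.Counter()
    counter.update(tokenizer("Wall St. Bears Claw Back Into the Black"))

    # torchtext <= 0.11 (the code removed by this patch):
    #   vocab = torchtext.vocab.Vocab(counter)
    #   idx = vocab.stoi.get('wall', 0)

    # torchtext 0.12 (the code added by this patch): vocab() is a factory
    # function, and lookups go through methods instead of attributes.
    vocab = torchtext.vocab.vocab(counter, min_freq=1)
    idx = vocab.get_stoi().get('wall', 0)   # 0 doubles as the unk index, as in encode()
    print(idx, vocab.get_itos()[idx])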
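
The new lessons/5-NLP/17-GenerativeNetworks/torchnlp.py is the same helper module already used by lessons 14 and 16 (same blob cd709f0), so the generative-networks notebook can share load_dataset(), encode() and padify(). A sketch of how those helpers are typically wired together; the DataLoader wiring and batch size here are illustrative assumptions, not part of the diff:

    import torch
    from torchnlp import load_dataset, padify   # the module added by this patch

    # Downloads AG_NEWS (requires the newly added torchdata dependency) and
    # builds the vocabulary with the torchtext.vocab.vocab() factory.
    train_dataset, test_dataset, classes, vocab = load_dataset()

    # padify() turns a list of (label, text) pairs into a label tensor and a
    # zero-padded feature tensor, so it can act as a DataLoader collate_fn.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

    labels, features = next(iter(train_loader))
    print(labels.shape, features.shape)   # (16,) and (16, longest sequence in the batch)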