**Transfer learning tutorial for NLP enthusiasts, part 1.**

This notebook shows how to work with tokenizers, in particular the Polish RoBERTa tokenizer, which we will use later in this tutorial.

At the beginning:
- install missing libraries
- download models

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 52.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.6 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 71.0 MB/s 
Collecting

In [None]:
import torch, os
from transformers import RobertaModel, AutoModel, PreTrainedTokenizerFast
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, load_metric
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
#model
!wget https://github.com/sdadas/polish-roberta/releases/download/models-v2/roberta_base_transformers.zip

--2021-09-12 15:19:26--  https://github.com/sdadas/polish-roberta/releases/download/models-v2/roberta_base_transformers.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/247501435/bea4e000-8a5d-11eb-86cc-793bd6e126a7?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210912%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210912T151927Z&X-Amz-Expires=300&X-Amz-Signature=3e96f81cc3c0751a37ea79a200b2dd67611132a3d82e0f5dd82a099d66cfdcb4&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=247501435&response-content-disposition=attachment%3B%20filename%3Droberta_base_transformers.zip&response-content-type=application%2Foctet-stream [following]
--2021-09-12 15:19:27--  https://github-releases.githubusercontent.com/247501435/bea4e000-8a5d-11eb-86cc-793bd6e126a7?X-Amz-Algorithm=AWS4-HMAC-SHA256&X

In [None]:
!mkdir roberta
!unzip roberta_base_transformers.zip -d roberta

Archive:  roberta_base_transformers.zip
  inflating: roberta/config.json     
  inflating: roberta/pytorch_model.bin  
  inflating: roberta/tokenizer.json  


In [None]:
# Print the model configuration file that was extracted from the archive.
!cat roberta/config.json

{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.4.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50001
}


In [None]:
# Load the tokenizer of the Polish RoBERTa model from the local directory
# that the archive was unpacked into.
model_dir = "./roberta"
rtokenizer = AutoTokenizer.from_pretrained(model_dir)

In [None]:
# Tokenize a Polish sentence; the result holds the token ids and the attention mask.
tokenized = rtokenizer("Chciałbym zażółcić gęślą jaźń, kolego.")
tokenized

{'input_ids': [0, 996, 8, 2790, 3501, 20912, 12, 7908, 25997, 461, 2204, 900, 4, 3933, 74, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Same sentence, additionally asking for the offset mapping: one (start, end)
# character span in the input string per token.
tokenized = rtokenizer("Chciałbym zażółcić gęślą jaźń, kolego.", return_offsets_mapping=True)
tokenized

{'input_ids': [0, 996, 8, 2790, 3501, 20912, 12, 7908, 25997, 461, 2204, 900, 4, 3933, 74, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 9), (10, 11), (11, 13), (13, 15), (15, 18), (19, 20), (19, 21), (21, 24), (25, 27), (27, 28), (28, 29), (29, 30), (31, 34), (34, 37), (37, 38), (0, 0)]}

In [None]:
# Map the ids back to token strings to see how the sentence was split.
print(rtokenizer.convert_ids_to_tokens(tokenized['input_ids']))

['<s>', '▁Chciałbym', '▁z', 'aż', 'ół', 'cić', '▁', 'gę', 'ślą', '▁ja', 'ź', 'ń', ',', '▁kol', 'ego', '.', '</s>']


In [None]:
# Encode the sentence to a fixed length of 32 tokens:
# shorter inputs are padded, longer ones would be truncated.
tokenized = rtokenizer(
    "Chciałbym zażółcić gęślą jaźń, kolego.",
    max_length=32,
    padding="max_length",
    truncation=True,
)
tokenized

{'input_ids': [0, 996, 8, 2790, 3501, 20912, 12, 7908, 25997, 461, 2204, 900, 4, 3933, 74, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# Show the token strings of the padded encoding, including the padding positions.
print(rtokenizer.convert_ids_to_tokens(tokenized['input_ids']))

['<s>', '▁Chciałbym', '▁z', 'aż', 'ół', 'cić', '▁', 'gę', 'ślą', '▁ja', 'ź', 'ń', ',', '▁kol', 'ego', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
# Feed an English sentence to the Polish tokenizer to see how it copes
# with text it was not built for.
tokenized = rtokenizer("I'm experiencing some difficulties.")
tokenized

{'input_ids': [0, 101, 3, 22, 588, 3, 2809, 68, 43516, 29381, 2657, 17470, 172, 1449, 40677, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Inspect the token strings the Polish tokenizer produced for the English sentence.
print(rtokenizer.convert_ids_to_tokens(tokenized['input_ids']))

['<s>', '▁I', '<unk>', 'm', '▁e', '<unk>', 'per', 'ie', 'ncing', '▁some', '▁di', 'ffi', 'c', 'ul', 'ties', '.', '</s>']


In [None]:
# For comparison, tokenize the same English sentence with the "roberta-base"
# tokenizer (fetched by name rather than from a local directory).
rtokenizerE = AutoTokenizer.from_pretrained("roberta-base")
tokenizedE = rtokenizerE("I'm experiencing some difficulties.")
tokenizedE

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'input_ids': [0, 100, 437, 7242, 103, 9282, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Token strings produced by the "roberta-base" tokenizer.
print(rtokenizerE.convert_ids_to_tokens(tokenizedE['input_ids']))

['<s>', 'I', "'m", 'Ġexperiencing', 'Ġsome', 'Ġdifficulties', '.', '</s>']


In [None]:
# Tokenizing input that is already split into words (a list of strings).
# A second tokenizer instance is loaded with add_prefix_space=True for this.
rtokenizer2 = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=True)
tokenized2 = rtokenizer2(
    ["Chciałbym", "zażółcić", "gęślą", "jaźń", ",", "kolego", "."],
    is_split_into_words=True,
)
tokenized2

{'input_ids': [0, 996, 8, 2790, 3501, 20912, 12, 7908, 25997, 461, 2204, 900, 12, 4, 3933, 74, 12, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Problem with pre-split input: the comma and the period are separate list items,
# so each of them ends up with a standalone '▁' (space) token in front of it.
# Decode with rtokenizer2, the tokenizer that actually produced tokenized2.
print(rtokenizer2.convert_ids_to_tokens(tokenized2['input_ids']))

['<s>', '▁Chciałbym', '▁z', 'aż', 'ół', 'cić', '▁', 'gę', 'ślą', '▁ja', 'ź', 'ń', '▁', ',', '▁kol', 'ego', '▁', '.', '</s>']


In [None]:
# Size of the Polish tokenizer's vocabulary.
rtokenizer.vocab_size

50001

In [None]:
# Tokenize the same English sentence with the "bert-base-cased" tokenizer.
rtokenizerB = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizedB = rtokenizerB("I'm experiencing some difficulties.")
tokenizedB

{'input_ids': [101, 146, 112, 182, 13992, 1199, 7866, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# No subword pieces for this sentence: every word maps to a single token
# (only "I'm" is split, at the apostrophe).
print(rtokenizerB.convert_ids_to_tokens(tokenizedB['input_ids']))

['[CLS]', 'I', "'", 'm', 'experiencing', 'some', 'difficulties', '.', '[SEP]']


In [None]:
# Encode a sentence pair, for tasks that take two inputs.
# Both sentences are passed to a single tokenizer call and come back as one encoding.
tokenizedB = rtokenizerB(
    "I'm experiencing some difficulties.",
    "And so I will go to see my doctor.",
)
tokenizedB

{'input_ids': [101, 146, 112, 182, 13992, 1199, 7866, 119, 102, 1262, 1177, 146, 1209, 1301, 1106, 1267, 1139, 3995, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Token strings of the encoded sentence pair.
print(rtokenizerB.convert_ids_to_tokens(tokenizedB['input_ids']))

['[CLS]', 'I', "'", 'm', 'experiencing', 'some', 'difficulties', '.', '[SEP]', 'And', 'so', 'I', 'will', 'go', 'to', 'see', 'my', 'doctor', '.', '[SEP]']
