PyTorchのtransforms.v2を文字の画像に適用する
Published:
By nob
Category: Posts
文字の画像を変換してみる。変換には
torchvision.transforms.v2
の変換処理を使う。
画像の読み込み
from torchvision.io import read_image
# Load the sample character image used by every example below.
original_image = read_image("data/pism/image/table-1-a-0212.png")
画像表示関数の定義
import matplotlib.pyplot as plt
from torchvision.transforms import v2
from torchvision.transforms.v2 import functional
def imshow(image_tensors):
    """Show the module-level ``original_image`` next to each transformed image.

    ``image_tensors`` maps a title (dict key) to an image tensor; everything
    is rendered in grayscale on a single row of subplots.
    """
    panel_count = len(image_tensors) + 1
    fig, axs = plt.subplots(1, panel_count, figsize=(12, 6))
    # Leftmost panel always shows the untouched original.
    axs[0].set_title("original")
    axs[0].imshow(functional.to_pil_image(original_image), cmap=plt.cm.gray)
    # Remaining panels show each transformed image under its title.
    for idx, (title, tensor) in enumerate(image_tensors.items(), start=1):
        axs[idx].set_title(title)
        axs[idx].imshow(functional.to_pil_image(tensor), cmap=plt.cm.gray)
    plt.tight_layout()
    plt.show()
v2.Pad
指定した余白が追加される。
# Apply v2.Pad with several border widths and compare the results.
paddings = [5, 10, 20, 40]
transformed_images = {}
for pad in paddings:
    transformed_images[pad] = v2.Pad(padding=pad)(original_image)
imshow(transformed_images)
v2.Resize
サイズをintで指定すると、画像の短辺の長さが指定したサイズに一致するように拡大縮小される。
# An int size makes v2.Resize scale the shorter side to that value.
sizes = [20, 30, 40, 50]
transformed_images = {}
for target in sizes:
    resize = v2.Resize(size=target, antialias=None)
    transformed_images[target] = resize(original_image)
imshow(transformed_images)
v2.ElasticTransform
see-through-water-like effect
を作り出す。
# Try v2.ElasticTransform with increasing displacement magnitudes.
alphas = [10, 50, 100, 200]
transformed_images = {}
for magnitude in alphas:
    transformed_images[magnitude] = v2.ElasticTransform(alpha=magnitude)(original_image)
imshow(transformed_images)
v2.GaussianBlur
指定した標準偏差のカーネルを使ってガウスぼかしをする。
# Gaussian blur with a fixed 5x5 kernel and several standard deviations.
sigmas = [0.5, 0.8, 1.0, 10.0]
transformed_images = {}
for std in sigmas:
    blur = v2.GaussianBlur(kernel_size=(5, 5), sigma=std)
    transformed_images[std] = blur(original_image)
imshow(transformed_images)
画像の中心を移動する
変換処理を自作してみる。
from torch import nn
class Move(nn.Module):
    """Translate an image by (dx, dy) pixels, filling vacated areas with black.

    Positive ``dx`` shifts the content right, positive ``dy`` shifts it down.
    """

    def __init__(self, dx=0, dy=0):
        super().__init__()
        self.dx = dx
        self.dy = dy

    def forward(self, image):
        """Return the shifted image as a tensor.

        The tensor is converted to PIL, the visible region is cropped and
        re-pasted at the shifted position, then converted back to a tensor.
        """
        org = functional.to_pil_image(image)
        # Start from an all-black canvas of the same size.
        # NOTE(review): paste(0, ...) assumes a single-band (grayscale)
        # image — confirm against the input data.
        tmp = org.copy()
        tmp.paste(0, (0, 0, *org.size))
        # (x, y) is where the cropped region lands; (left, upper, right,
        # lower) is the crop box of the source image.
        x, y = 0, 0
        left, upper, right, lower = 0, 0, org.size[0], org.size[1]
        if self.dx >= 0:
            x = self.dx
            right = org.size[0] - self.dx
        else:
            left = -self.dx
        if self.dy >= 0:
            y = self.dy
            lower = org.size[1] - self.dy
        else:
            upper = -self.dy
        tmp.paste(org.crop((left, upper, right, lower)), (x, y))
        return functional.pil_to_tensor(tmp)

    def __repr__(self):
        # str() falls back to __repr__, so the previously duplicated
        # __str__ definition is unnecessary.
        return f"Move(dx={self.dx}, dy={self.dy})"
# Shift the image by several (dx, dy) offsets using the custom Move module.
deltas = [(0, 0), (5, 0), (5, 5), (0, 5), (-5, 0), (-5, -5)]
transformed_images = {}
for dx, dy in deltas:
    transformed_images[(dx, dy)] = Move(dx=dx, dy=dy)(original_image)
imshow(transformed_images)
v2.functional.affine で同じ処理を実現できる。
# The same translation can be done with v2.functional.affine.
deltas = [(0, 0), (5, 0), (5, 5), (0, 5), (-5, 0), (-5, -5)]
transformed_images = {}
for dx, dy in deltas:
    transformed_images[(dx, dy)] = functional.affine(
        original_image, angle=0, scale=1, shear=(0, 0), translate=(dx, dy)
    )
imshow(transformed_images)
移動量が指定した範囲でランダムでよい場合は v2.RandomAffine を使う。
# v2.RandomAffine draws a random translation within the given fraction range.
transformed_images = {}
for trial in range(6):
    random_shift = v2.RandomAffine(degrees=0, translate=(0.1, 0.1))
    transformed_images[trial] = random_shift(original_image)
imshow(transformed_images)
余白を取り除く
import cv2
import numpy as np
from PIL import Image
class Trim(nn.Module):
    """Crop an image to the bounding box of the single character it contains.

    Raises ValueError when the character cannot be isolated as exactly one
    contour (blank image, broken glyph, or multiple characters).
    """

    def forward(self, image):
        """Return the cropped image as a tensor."""
        tmp = functional.to_pil_image(image)
        tmp = np.array(tmp)
        # Thicken the strokes so disconnected glyph parts merge into
        # a single contour.
        dilated = cv2.dilate(tmp, None, iterations=2)
        # Extract the outer contours of the character.
        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # A blank image yields no contour at all; fail explicitly instead
        # of crashing with IndexError on contours[0] below.
        if not contours:
            raise ValueError(f"no contour found. image={tmp}")
        # More than one contour means the glyph was not captured as a whole,
        # or the image contains two or more characters.
        if len(contours) > 1:
            raise ValueError(
                f"more than one contour found. image={tmp} number of contours={len(contours)} contours={contours}"
            )
        # Crop to the bounding box of the single contour.
        x, y, w, h = cv2.boundingRect(contours[0])
        tmp = tmp[y : y + h, x : x + w]
        return functional.pil_to_tensor(Image.fromarray(tmp))

    def __repr__(self):
        # str() falls back to __repr__; the duplicated __str__ was removed.
        return "Trim()"
# Trim three sample character images and compare each with its original.
image_paths = [
    "data/pism/image/table-1-a-0212.png",
    "data/pism/image/table-1-b-0217.png",
    "data/pism/image/table-1-c-0601.png",
]
original_images = [read_image(path) for path in image_paths]
transformed_images = [Trim()(image) for image in original_images]
fig = plt.figure(figsize=(6, 6))
row = 0
for original, transformed in zip(original_images, transformed_images):
    # Each row: original on the left, trimmed result on the right.
    left = fig.add_subplot(len(original_images), 2, 2 * row + 1)
    left.set_title("original")
    left.imshow(functional.to_pil_image(original), cmap=plt.cm.gray)
    right = fig.add_subplot(len(original_images), 2, 2 * row + 2)
    right.set_title("transformed")
    right.imshow(functional.to_pil_image(transformed), cmap=plt.cm.gray)
    row += 1
plt.tight_layout()
plt.show()
画像の大きさを変更する
from PIL import ImageOps
class PillowPad(nn.Module):
    """Resize and pad an image to ``size`` using ``ImageOps.pad``.

    ``size`` is a (width, height) pair; the image is centered on a black
    background of the requested size.
    """

    def __init__(self, size):
        super().__init__()
        self.size = size

    def forward(self, image):
        """Return the padded image as a tensor."""
        pil_image = functional.to_pil_image(image)
        padded = ImageOps.pad(pil_image, self.size, color="#000")
        return functional.pil_to_tensor(padded)

    def __repr__(self):
        # str() falls back to __repr__, so the previously duplicated
        # __str__ definition was removed.
        return f"PillowPad(size=({self.size[0]}, {self.size[1]}))"
# Pad the image to several target sizes with the custom PillowPad module.
sizes = [(60, 60), (90, 120), (90, 90), (100, 100)]
transformed_images = {}
for target_size in sizes:
    transformed_images[target_size] = PillowPad(size=target_size)(original_image)
imshow(transformed_images)