
Applying PyTorch's transforms.v2 to character images

Published:

By nob

Category: Posts

Tags: character recognition Pillow PyTorch Python

Let's try transforming character images, using the transforms provided by torchvision.transforms.v2.

Loading the image

from torchvision.io import read_image

original_image = read_image("data/pism/image/table-1-a-0212.png")

Defining a function to display the images

import matplotlib.pyplot as plt
from torchvision.transforms import v2
from torchvision.transforms.v2 import functional


def imshow(image_tensors):
    fig, axs = plt.subplots(1, len(image_tensors) + 1, figsize=(12, 6))
    axs[0].set_title("original")
    axs[0].imshow(functional.to_pil_image(original_image), cmap=plt.cm.gray)
    for i, (title, image_tensor) in enumerate(image_tensors.items(), start=1):
        axs[i].set_title(title)
        axs[i].imshow(functional.to_pil_image(image_tensor), cmap=plt.cm.gray)
    plt.tight_layout()
    plt.show()

v2.Pad

Adds the specified amount of padding around the image.

paddings = [5, 10, 20, 40]

transformed_images = {
    padding: v2.Pad(padding=padding)(original_image) for padding in paddings
}

imshow(transformed_images)

fig-1
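
As a side note, padding also accepts a (left, top, right, bottom) sequence, and the fill and padding_mode arguments control what the new border is filled with. A minimal sketch (the values below are arbitrary examples, not from the original experiment):

transformed_images = {
    "asymmetric": v2.Pad(padding=(5, 10, 20, 40))(original_image),
    "fill=128": v2.Pad(padding=10, fill=128)(original_image),
    "edge": v2.Pad(padding=10, padding_mode="edge")(original_image),
}

imshow(transformed_images)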

v2.Resize

When the size is given as an int, the image is scaled so that its shorter edge matches the specified size.

sizes = [20, 30, 40, 50]

transformed_images = {
    size: v2.Resize(size=size, antialias=None)(original_image)
    for size in sizes
}

imshow(transformed_images)

fig-2
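
Passing a (height, width) tuple instead of an int resizes to that exact size, ignoring the aspect ratio. A minimal sketch (the sizes are arbitrary examples):

exact_sizes = [(30, 60), (60, 30), (50, 50)]

transformed_images = {
    size: v2.Resize(size=size, antialias=None)(original_image)
    for size in exact_sizes
}

imshow(transformed_images)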

v2.ElasticTransform

Produces a "see-through-water-like effect".

alphas = [10, 50, 100, 200]

transformed_images = {
    alpha: v2.ElasticTransform(alpha=alpha)(original_image) for alpha in alphas
}

imshow(transformed_images)

fig-3
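
ElasticTransform also takes a sigma parameter that controls how smooth the displacement field is (larger values give gentler, broader distortions). A minimal sketch with alpha fixed and sigma varied (the values are arbitrary examples):

sigmas = [2.0, 4.0, 6.0, 8.0]

transformed_images = {
    sigma: v2.ElasticTransform(alpha=100.0, sigma=sigma)(original_image)
    for sigma in sigmas
}

imshow(transformed_images)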

v2.GaussianBlur

Applies a Gaussian blur using a kernel with the specified standard deviation.

sigmas = [0.5, 0.8, 1.0, 10.0]
transformed_images = {
    sigma: v2.GaussianBlur(kernel_size=(5, 5), sigma=sigma)(original_image)
    for sigma in sigmas
}

imshow(transformed_images)

fig-4
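
sigma can also be given as a (min, max) range, in which case a value is sampled uniformly on every call, which is handy for augmentation. A minimal sketch (the range is an arbitrary example):

blur = v2.GaussianBlur(kernel_size=(5, 5), sigma=(0.5, 2.0))

transformed_images = {i: blur(original_image) for i in range(4)}

imshow(transformed_images)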

Shifting the center of the image

Let's implement a custom transform.

from torch import nn


class Move(nn.Module):

    def __init__(self, dx=0, dy=0):
        super().__init__()
        self.dx = dx
        self.dy = dy

    def forward(self, image):
        org = functional.to_pil_image(image)

        # Start from an all-black canvas of the same size
        tmp = org.copy()
        tmp.paste(0, (0, 0, *org.size))

        # Work out the paste position (x, y) and the region of the
        # original to keep, so the content is shifted by (dx, dy)
        # while the image size stays the same
        x, y = 0, 0
        left, upper, right, lower = 0, 0, org.size[0], org.size[1]

        if self.dx >= 0:
            x = self.dx
            right = org.size[0] - self.dx
        else:
            left = -self.dx

        if self.dy >= 0:
            y = self.dy
            lower = org.size[1] - self.dy
        else:
            upper = -self.dy

        tmp.paste(org.crop((left, upper, right, lower)), (x, y))

        return functional.pil_to_tensor(tmp)

    def __str__(self):
        return f"Move(dx={self.dx}, dy={self.dy})"

    def __repr__(self):
        return f"Move(dx={self.dx}, dy={self.dy})"

deltas = [(0, 0), (5, 0), (5, 5), (0, 5), (-5, 0), (-5, -5)]
transformed_images = {
    (dx, dy): Move(dx=dx, dy=dy)(original_image) for dx, dy in deltas
}

imshow(transformed_images)

fig-5
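
Since Move is an nn.Module that maps a tensor to a tensor, it can also be chained with the built-in transforms via v2.Compose. A minimal sketch (the combination and parameter values are arbitrary examples):

pipeline = v2.Compose([
    Move(dx=5, dy=5),
    v2.Pad(padding=5),
])

imshow({"Move + Pad": pipeline(original_image)})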

The same transformation can be achieved with v2.functional.affine.

deltas = [(0, 0), (5, 0), (5, 5), (0, 5), (-5, 0), (-5, -5)]
transformed_images = {
    (dx, dy): functional.affine(
        original_image, angle=0, scale=1, shear=(0, 0), translate=(dx, dy)
    )
    for dx, dy in deltas
}

imshow(transformed_images)

fig-6
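
functional.affine can also rotate, scale, and shear in the same call, which may be useful when augmenting character images. A minimal sketch (the angles and parameters are arbitrary examples):

transformed_images = {
    angle: functional.affine(
        original_image, angle=angle, scale=1.0, shear=(10, 0), translate=(0, 0)
    )
    for angle in [0, 10, 20, 30]
}

imshow(transformed_images)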

If a random shift within a specified range is sufficient, use v2.RandomAffine.

transformed_images = {
    i: v2.RandomAffine(degrees=0, translate=(0.1, 0.1))(original_image)
    for i in range(6)
}

imshow(transformed_images)

fig-7
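
v2.RandomAffine can randomize rotation and scale in the same call as well. A minimal sketch (the ranges below are arbitrary examples):

augment = v2.RandomAffine(degrees=5, translate=(0.1, 0.1), scale=(0.9, 1.1))

transformed_images = {i: augment(original_image) for i in range(6)}

imshow(transformed_images)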

Trimming the margins

import cv2
import numpy as np
from PIL import Image


class Trim(nn.Module):

    def forward(self, image):
        tmp = functional.to_pil_image(image)
        tmp = np.array(tmp)
        # Dilate to thicken the strokes
        dilated = cv2.dilate(tmp, None, iterations=2)
        # Extract the contours of the character
        contours, _ = cv2.findContours(
            dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        # More than one contour was found
        # (= the whole character was not captured, or the image
        # contains two or more characters)
        if len(contours) > 1:
            raise ValueError(
                f"more than one contour found. image={tmp} number of contours={len(contours)} contours={contours}"
            )
        # Compute the bounding box and crop to it
        x, y, w, h = cv2.boundingRect(contours[0])
        tmp = tmp[y : y + h, x : x + w]
        return functional.pil_to_tensor(Image.fromarray(tmp))

    def __str__(self):
        return "Trim()"

    def __repr__(self):
        return "Trim()"

original_images = [
    read_image("data/pism/image/table-1-a-0212.png"),
    read_image("data/pism/image/table-1-b-0217.png"),
    read_image("data/pism/image/table-1-c-0601.png"),
]
transformed_images = [Trim()(image) for image in original_images]

fig = plt.figure(figsize=(6, 6))

for i, (original, transformed) in enumerate(
    zip(original_images, transformed_images), start=1
):
    ax = fig.add_subplot(len(original_images), 2, 2 * i - 1)
    ax.set_title("original")
    ax.imshow(functional.to_pil_image(original), cmap=plt.cm.gray)

    ax = fig.add_subplot(len(original_images), 2, 2 * i)
    ax.set_title("transformed")
    ax.imshow(functional.to_pil_image(transformed), cmap=plt.cm.gray)

plt.tight_layout()
plt.show()

fig-8

Changing the image size

from PIL import ImageOps


class PillowPad(nn.Module):

    def __init__(self, size):
        super().__init__()
        self.size = size

    def forward(self, image):
        tmp = functional.to_pil_image(image)
        tmp = ImageOps.pad(tmp, self.size, color="#000")
        return functional.pil_to_tensor(tmp)

    def __str__(self):
        return f"PillowPad(size=({self.size[0]}, {self.size[1]}))"

    def __repr__(self):
        return f"PillowPad(size=({self.size[0]}, {self.size[1]}))"

sizes = [(60, 60), (90, 120), (90, 90), (100, 100)]
transformed_images = {
    size: PillowPad(size=size)(original_image) for size in sizes
}

imshow(transformed_images)

fig-9
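
Finally, the custom transforms and the built-in v2 transforms can be chained with v2.Compose into a single preprocessing pipeline. A minimal sketch (the order and parameter values are arbitrary choices for illustration, not a recommendation):

pipeline = v2.Compose([
    Trim(),
    PillowPad(size=(90, 90)),
    v2.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    v2.GaussianBlur(kernel_size=(5, 5), sigma=1.0),
])

transformed_images = {i: pipeline(original_image) for i in range(4)}

imshow(transformed_images)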