PDFから表の画像を取り出してDataFrameにする
Published:
By nob
Category: Posts
前提
| software | version |
|---|---|
| Ubuntu | 22.04.4 LTS |
| PyMuPDF | 1.24.9 |
| Tesseract | 5.4.1 |
| tesserocr | 2.7.0 |
| OpenCV | 4.10.0 |
| pandas | 2.2.2 |
| Python | 3.11.9 |
ライブラリのインストール
PyMuPDFのインストール
$ cd [poetryで初期化したプロジェクトディレクトリ]
$ poetry add pymupdf
Tesseractのインストール
標準でインストールされるTesseractが古い(4.1.1)ので、最新版をインストールする
参考:
# add-apt-repository ppa:alex-p/tesseract-ocr5
# apt update
# apt install tesseract-ocr
tesserocrのインストール
# apt install libtesseract-dev
$ cd [作業ディレクトリ]
$ git clone https://github.com/sirfz/tesserocr.git
$ cd tesserocr
$ CPPFLAGS=-march=native python3 setup.py build_ext -I/usr/local/include
$ cd [poetryで初期化したプロジェクトディレクトリ]
$ poetry add [作業ディレクトリ]/tesserocr
OpenCVのインストール
$ cd [poetryで初期化したプロジェクトディレクトリ]
$ poetry add opencv-python
pandasのインストール
$ cd [poetryで初期化したプロジェクトディレクトリ]
$ poetry add pandas
PDFから画像を抽出する
統計数理 第53巻 第2号 ナース・スケジューリング - 調査・モデル化・アルゴリズム - 表1を抽出する。
参考:
画像表示関数の定義
import matplotlib.pyplot as plt
def imshow(image):
    """Show ``image`` in a new 9x9-inch matplotlib figure with the gray colormap.

    Args:
        image: Anything ``Axes.imshow`` accepts (a PIL image or a NumPy array
            in this notebook).
    """
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(image, cmap=plt.cm.gray)
    plt.tight_layout()
    plt.show()
画像を抽出する
import io
import fitz
import pymupdf
from PIL import Image, ImageOps
pdf_file_name = "docs/pism/53-2-231.pdf"
page_index = 2  # index passed to the document's [] operator below

# Only the calls that need the open document stay inside the ``with`` block;
# ``extract_image`` returns a plain dict, so the rest can run after closing.
with fitz.open(pdf_file_name) as pdf_file:
    page = pdf_file[page_index]
    # Use the first image listed for the page.
    image_metadata = page.get_images()[0]
    # The first field of the image entry is the xref used to extract it.
    xref = image_metadata[0]
    pdf_image = pdf_file.extract_image(xref)

image_bytes = pdf_image["image"]
image_ext = pdf_image["ext"]
image_size = (pdf_image["width"], pdf_image["height"])
image = Image.open(io.BytesIO(image_bytes))
imshow(image)
チーム毎の勤務表画像を抽出する
参考:
画像のバイト列を
np.ndarray
に変換する
バイト列はそのままではOpenCVがデコードできないようなので
np.ndarray
に変換する。
import cv2
import numpy as np
# View the still-encoded image bytes as a flat uint8 array so that
# cv2.imdecode can consume them in the next step.
image_array = np.frombuffer(image_bytes, np.uint8)
print(image_array)
[137 80 78 ... 66 96 130]
OpenCVでバイト列を読み込む
# Decode the encoded byte buffer into a pixel array; IMREAD_UNCHANGED asks
# OpenCV to load the image as stored instead of converting it.
grayscale_image = cv2.imdecode(image_array, cv2.IMREAD_UNCHANGED)
imshow(grayscale_image)
白黒の2値データとして読み込まれた。
print(grayscale_image)
# Select every pixel strictly between 0 and 255. An empty result means the
# image contains only the two extreme values, i.e. it is already binary.
print(grayscale_image[(grayscale_image > 0) & (grayscale_image < 255)])
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
[]
2値データに変換する
2値データとして読み込まれているので2値データへの変換はしないことにする。
# NOTE: cv2.threshold returns a (threshold_value, image) tuple, so
# ``binary_image`` is that tuple here, not just the image (see the printed
# output). It is shown for reference only and is not used afterwards.
binary_image = cv2.threshold(grayscale_image, 20, 255, cv2.THRESH_BINARY)
print(binary_image)
(20.0, array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8))
白黒反転
試しに白黒反転してみる。
# Flip every bit of each pixel (0 <-> 255 for this binary image).
inverted_image = cv2.bitwise_not(grayscale_image)
imshow(inverted_image)
線を太くする
輪郭を捉えやすくするために線を太くする。
# Dilate twice to thicken the strokes. Passing None as the kernel lets
# OpenCV use its default structuring element.
dilated_image = cv2.dilate(grayscale_image, None, iterations=2)
imshow(dilated_image)
輪郭を抽出する
参考:
# Find every contour in the thickened image (RETR_TREE keeps the full
# nesting hierarchy; CHAIN_APPROX_SIMPLE compresses straight segments).
contours, hierarchy = cv2.findContours(
dilated_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)
# Draw all contours (index -1) in white, 2 px thick, on a black canvas.
contour_image = np.zeros_like(grayscale_image)
cv2.drawContours(contour_image, contours, -1, 255, 2)
imshow(contour_image)
長方形の輪郭を抽出する
チーム毎の勤務表を抽出したいので長方形の輪郭を探す。
# Keep only the contours whose polygonal approximation has four vertices.
rectangle_contours = []
for contour in contours:
    perimeter = cv2.arcLength(contour, True)
    # Approximation tolerance is 2% of the closed contour's perimeter.
    approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)
    if len(approx) == 4:
        rectangle_contours.append(approx)

rectangle_contour_image = np.zeros_like(grayscale_image)
cv2.drawContours(rectangle_contour_image, rectangle_contours, -1, 255, 1)
imshow(rectangle_contour_image)
長方形の輪郭のうち上位3つを抽出する
おそらくチーム毎の勤務表の輪郭が他と比べて大きくなると予想されるので上位3つを抽出する。
# Sort the quadrilateral contours by area in ascending order, so the three
# largest ones are the last three entries.
sorted_rectangle_contours = sorted(rectangle_contours, key=cv2.contourArea)
top3_rectangle_contours = sorted_rectangle_contours[-3:]
print([cv2.contourArea(x) for x in top3_rectangle_contours])

top3_rectangle_contour_image = np.zeros_like(grayscale_image)
cv2.drawContours(
    top3_rectangle_contour_image, top3_rectangle_contours, -1, 255, 1
)
imshow(top3_rectangle_contour_image)
[396750.0, 396864.0, 439901.0]
チームの人数が不均等なので(Aチーム10人・Bチーム9人・Cチーム9人)、Aチームの勤務表の面積が他のチームのものと比べて少し大きくなっている。
勤務表をチーム毎に切り取る
# Crop the three largest rectangles out of the page image. ``tables`` keeps
# the ascending-area order of ``sorted_rectangle_contours``.
tables = []
for contour in sorted_rectangle_contours[-3:]:
    rect = cv2.boundingRect(contour)  # (x, y, width, height)
    print(rect)
    x, y, w, h = rect
    table = grayscale_image[y : y + h, x : x + w]
    tables.append(table)
    imshow(table)
(0, 734, 1381, 289)
(0, 443, 1381, 289)
(0, 122, 1381, 320)
勤務表画像から文字を抽出する
Aチームの勤務表を抜き出す
# ``tables`` is ordered by ascending area and the printed rects show the
# largest one at the top of the page, so index 2 is team A, 1 is B, 0 is C.
table_a = tables[2]
table_b = tables[1]
table_c = tables[0]
target_table_name = "a"
# Explicit lookup instead of eval() on a formatted string: same result for
# "a"/"b"/"c", and any other name raises KeyError instead of being executed.
target_table = {"a": table_a, "b": table_b, "c": table_c}[target_table_name]
横罫を抽出する
def get_hline_image(table):
    """Return an image intended to keep only the horizontal rules of ``table``.

    The image is eroded twice with a 1x6 all-ones kernel and the result is
    then dilated 100 times with the same kernel.

    Args:
        table: Table image (white strokes on black in this notebook).

    Returns:
        The eroded-then-dilated image.
    """
    kernel = np.ones([1, 6])
    image = cv2.erode(table, kernel, iterations=2)
    image = cv2.dilate(image, kernel, iterations=100)
    return image


hline_image = get_hline_image(target_table)
imshow(hline_image)
カーネルの形を工夫すると
cv2.dilate
のiterationが少なく済むようである。
参考:
def get_hline_image(table):
    """Return an image intended to keep only the horizontal rules of ``table``.

    Uses an 11x11 kernel whose middle row is set to 1: two erosions, then
    60 dilations with the same kernel.

    Args:
        table: Table image (white strokes on black in this notebook).

    Returns:
        The eroded-then-dilated image.
    """
    kernel = np.zeros((11, 11), np.uint8)
    kernel[5] = 1  # middle row only -> horizontal line element
    image = cv2.erode(table, kernel, iterations=2)
    image = cv2.dilate(image, kernel, iterations=60)
    return image


hline_image = get_hline_image(target_table)
imshow(hline_image)
縦罫を抽出する
def get_vline_image(table):
    """Return an image intended to keep only the vertical rules of ``table``.

    Uses an 11x11 kernel whose middle column is set to 1: two erosions, then
    60 dilations with the same kernel.

    Args:
        table: Table image (white strokes on black in this notebook).

    Returns:
        The eroded-then-dilated image.
    """
    kernel = np.zeros((11, 11), np.uint8)
    kernel[:, 5] = 1  # middle column only -> vertical line element
    image = cv2.erode(table, kernel, iterations=2)
    image = cv2.dilate(image, kernel, iterations=60)
    return image


vline_image = get_vline_image(target_table)
imshow(vline_image)
縦罫と横罫を足し合わせる
def get_line_image(table):
    """Return the horizontal and vertical rule images of ``table`` combined.

    Args:
        table: Table image passed to ``get_hline_image`` / ``get_vline_image``.

    Returns:
        ``cv2.add`` of the two rule images.
    """
    hline = get_hline_image(table)
    vline = get_vline_image(table)
    image = cv2.add(hline, vline)
    return image


line_image = get_line_image(target_table)
imshow(line_image)
罫線を太くする
def get_line_image(table):
    """Return the combined rule image of ``table`` with the rules thickened.

    The horizontal and vertical rule images are added together and then
    dilated five times with a 3x3 rectangular kernel.

    Args:
        table: Table image passed to ``get_hline_image`` / ``get_vline_image``.

    Returns:
        The thickened rule image.
    """
    hline = get_hline_image(table)
    vline = get_vline_image(table)
    image = cv2.add(hline, vline)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    return cv2.dilate(image, kernel, iterations=5)


line_image = get_line_image(target_table)
imshow(line_image)
元画像から罫線を取り除く
def remove_line(table, line):
    """Return ``table`` with the rule image ``line`` subtracted from it.

    Args:
        table: Table image.
        line: Rule image of the same shape (see ``get_line_image``).

    Returns:
        ``cv2.subtract(table, line)``.
    """
    return cv2.subtract(table, line)


line_image = get_line_image(target_table)
image_without_line = remove_line(target_table, line_image)
imshow(image_without_line)
文字認識領域を仮決めする
def get_mask(line):
    """Return the bitwise inverse of the rule image ``line``.

    After inversion the rules are dark and the areas between them are bright;
    those bright areas are used as candidate text regions.
    """
    return cv2.bitwise_not(line)


line_image = get_line_image(target_table)
mask = get_mask(line_image)
imshow(mask)
文字認識領域の輪郭を抽出する
def get_mask_contour(line):
    """Return the contours found in the inverted rule image.

    Args:
        line: Rule image (see ``get_line_image``).

    Returns:
        The contour sequence from ``cv2.findContours`` (hierarchy discarded).
    """
    mask = get_mask(line)
    contour, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    return contour


line_image = get_line_image(target_table)
mask_contour = get_mask_contour(line_image)
mask_contour_image = np.zeros_like(target_table)
cv2.drawContours(mask_contour_image, mask_contour, -1, 255, 2)
imshow(mask_contour_image)
文字認識領域を長方形にする
参考:
def get_word_bbox(line):
    """Return the bounding box of every mask contour.

    Args:
        line: Rule image (see ``get_line_image``).

    Returns:
        A list of ``[x, y, w, h]`` lists, one per contour, in the order
        ``cv2.findContours`` produced them.
    """
    mask_contour = get_mask_contour(line)
    bbox = []
    for contour in mask_contour:
        x, y, w, h = cv2.boundingRect(contour)
        bbox.append([x, y, w, h])
    return bbox


line_image = get_line_image(target_table)
word_bbox = get_word_bbox(line_image)
# Size the canvas from the table actually being processed rather than from
# table_a, so the cell still works when target_table_name is "b" or "c".
word_bbox_image = np.zeros_like(target_table)
for x, y, w, h in word_bbox:
    word_bbox_image = cv2.rectangle(
        word_bbox_image, (x, y), (x + w, y + h), 255, 2
    )
imshow(word_bbox_image)
文字が入らなさそうな領域を取り除く
def get_word_bbox(line):
    """Return the bounding boxes of mask contours that could hold text.

    A box is kept only if it does not start on the top edge (``y > 0``) and
    is more than 10 px wide and more than 10 px high.

    Args:
        line: Rule image (see ``get_line_image``).

    Returns:
        A list of ``[x, y, w, h]`` lists in ``cv2.findContours`` order.
    """
    mask_contour = get_mask_contour(line)
    bbox = []
    for contour in mask_contour:
        x, y, w, h = cv2.boundingRect(contour)
        if y > 0 and w > 10 and h > 10:
            bbox.append([x, y, w, h])
    return bbox


line_image = get_line_image(target_table)
word_bbox = get_word_bbox(line_image)
文字認識領域を行毎にまとめる
各領域の左上のy座標は行毎に概ね等しい値になっていると考えられるので、1つ前の領域と比べてその差が領域の高さの平均値の半分を超えたら改行していると考えることにする。
def get_bbox_rows(line):
    """Group the text-cell bounding boxes into table rows.

    Boxes are walked in reverse ``get_word_bbox`` order. A new row starts
    whenever the top-left ``y`` differs from the previous box's ``y`` by more
    than half the mean box height. Each row is sorted left to right by ``x``.

    Args:
        line: Rule image (see ``get_line_image``).

    Returns:
        A list of rows, each a list of ``[x, y, w, h]`` lists. Empty when no
        bounding box was found.
    """
    # Presumably reversed because cv2.findContours lists the cells from the
    # bottom of the table upwards - TODO confirm.
    word_bbox = list(reversed(get_word_bbox(line)))
    if not word_bbox:
        # The original indexed word_bbox[0] unconditionally and raised
        # IndexError when nothing was detected.
        return []

    bbox_hmean = np.mean([h for x, y, w, h in word_bbox if w > 10 and h > 10])
    bbox_hmean_div2 = bbox_hmean / 2

    rows = []
    cur_row = [word_bbox[0]]
    for bbox in word_bbox[1:]:
        cur_y = bbox[1]
        pre_y = cur_row[-1][1]
        if abs(cur_y - pre_y) <= bbox_hmean_div2:
            cur_row.append(bbox)
        else:
            rows.append(sorted(cur_row, key=lambda b: b[0]))
            cur_row = [bbox]
    # Flush the last row, which the loop never closes.
    rows.append(sorted(cur_row, key=lambda b: b[0]))
    return rows
余白を切り取る
文字の位置が領域によって様々なので、バウンディングボックスを調べて余白を切り取る。
参考:
def get_cropped_bbox_rows(line, table=None):
    """Shrink every cell bounding box to the non-zero pixels it contains.

    Args:
        line: Rule image (see ``get_line_image``).
        table: Table image the boxes refer to. Defaults to the notebook
            global ``table_a``, which is what the original always used.

    Returns:
        An integer array indexed ``[row, column]`` holding ``[x, y, w, h]``
        per cell. A cell with no non-zero pixel becomes a 1x1 box at its
        original top-left corner. ``np.zeros_like(rows)`` requires every row
        to have the same number of cells.
    """
    if table is None:
        table = table_a
    rows = get_bbox_rows(line)
    cropped_rows = np.zeros_like(rows)
    for ri, row in enumerate(rows):
        for ci, (x, y, w, h) in enumerate(row):
            word = table[y : y + h, x : x + w]
            if np.max(word) == 0:
                cropped_rows[ri, ci] = [x, y, 1, 1]
                continue
            # Rows / columns of the cell that contain a non-zero pixel.
            ys = np.any(word, axis=1)
            xs = np.any(word, axis=0)
            ymin, ymax = np.where(ys)[0][[0, -1]]
            xmin, xmax = np.where(xs)[0][[0, -1]]
            cropped_rows[ri, ci] = [
                x + xmin,
                y + ymin,
                xmax - xmin + 1,
                ymax - ymin + 1,
            ]
    return cropped_rows
今回余白の切り取りはしないことにする。
文字認識領域を描画してみる
cv2.copyMakeBorder
で領域の周囲に均等な幅の余白をつける。
参考:
line_image = get_line_image(target_table)
bbox_rows = get_bbox_rows(line_image)
fig, axes = plt.subplots(len(bbox_rows), len(bbox_rows[0]), figsize=(20, 6))
pad = 5  # border width in pixels added on every side of a cell
for ri, row in enumerate(bbox_rows):
    for ci, col in enumerate(row):
        ax = axes[ri, ci]
        ax.set_axis_off()
        word = target_table[col[1] : col[1] + col[3], col[0] : col[0] + col[2]]
        # The border colour must go in as ``value=``: the positional slot
        # after ``borderType`` is ``dst`` in the Python binding.
        padded_word = cv2.copyMakeBorder(
            word,
            pad,
            pad,
            pad,
            pad,
            cv2.BORDER_CONSTANT,
            value=(0, 0, 0),
        )
        ax.imshow(padded_word, cmap=plt.cm.gray)
        # One file per cell: table-1-<team>-<row><col>.<ext>
        cv2.imwrite(
            "data/pism/image/table-1-{}-{:02d}{:02d}.{}".format(
                target_table_name, ri, ci, image_ext
            ),
            padded_word,
        )
画像から文字を認識する
Tesseractで文字を認識する
OCRには Tesseract を、Pythonのバインディングとして tesserocr を使う。
Python Tesseract というのもあったのだが、Pythonからtesseractコマンドを呼び出して一時ファイル経由で処理しているようだった。
import tesserocr
from tesserocr import OEM, PSM, PyTessBaseAPI
line_image = get_line_image(target_table)
bbox_rows = get_bbox_rows(line_image)
fig, axes = plt.subplots(len(bbox_rows), len(bbox_rows[0]), figsize=(20, 6))
data = []
pad = 5  # border width in pixels added on every side of a cell
with PyTessBaseAPI(
    path="/usr/share/tesseract-ocr/5/tessdata",
    oem=OEM.LSTM_ONLY,
    psm=PSM.SINGLE_LINE,
) as api:
    api.SetVariable("tessedit_char_whitelist", "nN0123456789-——+/")
    for ri, row in enumerate(bbox_rows):
        data_row = []
        for ci, col in enumerate(row):
            ax = axes[ri, ci]
            ax.set_axis_off()
            word = target_table[
                col[1] : col[1] + col[3], col[0] : col[0] + col[2]
            ]
            # The border colour must go in as ``value=``: the positional slot
            # after ``borderType`` is ``dst`` in the Python binding.
            padded_word = cv2.copyMakeBorder(
                word,
                pad,
                pad,
                pad,
                pad,
                cv2.BORDER_CONSTANT,
                value=(0, 0, 0),
            )
            if np.any(word):
                api.SetImage(Image.fromarray(padded_word))
                text = api.GetUTF8Text().strip()
                if not text:
                    # Retry with slightly thicker strokes.
                    num_dilate = cv2.dilate(padded_word, None, iterations=1)
                    api.SetImage(Image.fromarray(num_dilate))
                    text = api.GetUTF8Text().strip()
                if not text:
                    # Retry after stretching the strokes horizontally.
                    kernel = np.zeros((5, 5), np.uint8)
                    kernel[2] = 1
                    bar_dilate = cv2.dilate(padded_word, kernel, iterations=3)
                    api.SetImage(Image.fromarray(bar_dilate))
                    text = api.GetUTF8Text().strip()
            else:
                # The cell has no non-zero pixel, so skip OCR.
                text = ""
            # Normalise the dash variants Tesseract may return to "-".
            if text and text in "-——":
                text = "-"
            data_row.append(text)
            ax.set_title(text, fontsize=8)
            ax.imshow(padded_word, cmap=plt.cm.gray)
        data.append(data_row)
plt.tight_layout()
plt.show()
DataFrameに変換する
Aチームの勤務表を変換する
import pandas as pd
# Widen pandas' display limits so the whole table is rendered.
pd.options.display.max_columns = 40
pd.options.display.max_rows = 100
# Build the frame from the OCR rows and drop the column labelled 0
# (presumably the empty leading cell of each row - TODO confirm).
df = pd.DataFrame(data).drop(0, axis=1)
df
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | n | / | / | - | N | n | / | - | / | - | - | + | N | n | / | - | - | N | n | / | - | - | N | n | / | - | / | N | n | / | 9 | 9 | 5 | 1 |
| 1 | 2 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | - | - | 9 | 11 | 5 | 0 |
| 2 | 3 | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | - | / | / | - | - | - | / | - | N | n | / | - | - | / | N | 9 | 12 | 5 | 0 |
| 3 | 4 | / | - | / | / | / | - | - | - | N | n | / | - | - | N | n | / | - | - | - | / | N | n | / | - | - | N | n | / | - | / | 10 | 12 | 4 | 0 |
| 4 | 5 | - | - | N | n | / | - | - | N | n | / | - | - | / | / | N | n | / | / | - | N | n | / | / | / | N | n | / | - | - | - | 10 | 10 | 5 | 0 |
| 5 | 6 | / | - | - | N | n | / | - | - | - | N | n | / | - | - | - | N | n | / | - | - | / | - | / | / | - | - | N | n | / | / | 9 | 13 | 4 | 0 |
| 6 | 7 | - | / | - | - | / | - | + | N | n | / | / | - | N | n | / | / | / | - | N | n | / | - | - | - | / | - | - | - | N | n | 9 | 12 | 4 | 1 |
| 7 | 8 | - | N | n | / | - | + | - | + | N | n | / | - | / | N | n | / | - | - | / | N | n | / | / | / | N | n | / | / | - | - | 10 | 8 | 5 | 2 |
| 8 | 9 | - | / | / | - | N | n | / | / | - | - | / | + | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | 10 | 11 | 4 | 1 |
| 9 | 10 | / | / | - | N | n | / | / | - | - | N | n | / | - | - | - | / | / | N | n | / | / | - | - | - | / | - | - | - | N | n | 10 | 12 | 4 | 0 |
関数化する
Bチーム・Cチームの勤務表で認識できない文字があるため、フィルタを変更する。
参考:
import tesserocr
from tesserocr import OEM, PSM, PyTessBaseAPI
def _ocr_text(api, image):
    """Run Tesseract on a NumPy image and return the stripped UTF-8 text."""
    api.SetImage(Image.fromarray(image))
    return api.GetUTF8Text().strip()


def _line_kernel(size, horizontal=False, vertical=False):
    """Build a size x size uint8 kernel with the centre row/column set to 1."""
    kernel = np.zeros((size, size), np.uint8)
    centre = size // 2
    if horizontal:
        kernel[centre] = 1
    if vertical:
        kernel[:, centre] = 1
    return kernel


def parse_bbox(api, bbox):
    """OCR a single table cell, trying progressively more specific filters.

    The cell is padded with a 5 px black border. A fixed sequence of
    (character whitelist, preprocessing) attempts is then run in order until
    one yields non-empty text. A whitelist of ``None`` keeps the one set by
    the previous attempt.

    Args:
        api: An open ``PyTessBaseAPI`` instance. Its
            ``tessedit_char_whitelist`` variable is modified.
        bbox: Cell image as a NumPy array.

    Returns:
        The recognised text, with dash variants normalised to ``"-"``, or
        ``""`` when the cell has no non-zero pixel or nothing was recognised.
    """
    if not np.any(bbox):
        return ""

    # The border colour must go in as ``value=``: the positional slot after
    # ``borderType`` is ``dst`` in the Python binding.
    padded = cv2.copyMakeBorder(
        bbox, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=(0, 0, 0)
    )

    attempts = [
        # "+" and "/": as-is, then with thicker strokes.
        ("+/", lambda img: img),
        (None, lambda img: cv2.dilate(img, None, iterations=1)),
        # Digits: as-is, cross-shaped dilation, then morphological closing.
        ("0123456789", lambda img: img),
        (
            None,
            lambda img: cv2.dilate(
                img, _line_kernel(3, horizontal=True, vertical=True), iterations=1
            ),
        ),
        (None, lambda img: cv2.morphologyEx(img, cv2.MORPH_CLOSE, None)),
        # "N": stretch strokes vertically.
        ("N", lambda img: cv2.dilate(img, _line_kernel(5, vertical=True), iterations=1)),
        # "n": cross-shaped dilation followed by a default erosion.
        (
            "n",
            lambda img: cv2.erode(
                cv2.dilate(
                    img,
                    kernel=_line_kernel(3, horizontal=True, vertical=True),
                    iterations=1,
                ),
                None,
                iterations=1,
            ),
        ),
        # Dashes: stretch strokes horizontally.
        # NOTE: the original also computed cv2.erode(bbox, kernel,
        # iterations=2) here, but immediately overwrote it with this dilation
        # of the un-eroded image. The dead call is dropped; output unchanged.
        ("-——", lambda img: cv2.dilate(img, _line_kernel(5, horizontal=True), iterations=4)),
    ]

    text = ""
    for whitelist, preprocess in attempts:
        if whitelist is not None:
            api.SetVariable("tessedit_char_whitelist", whitelist)
        text = _ocr_text(api, preprocess(padded))
        if text:
            break

    if not text:
        return ""
    if text in "-——":
        text = "-"
    return text
def parse_table_image(table, debug=False):
    """OCR every cell of a table image and return the text row by row.

    Args:
        table: Table image (one team's roster in this notebook).
        debug: When true, plot every cell with its recognised text as title.

    Returns:
        A list of rows, each a list of strings as returned by ``parse_bbox``.
    """
    line_image = get_line_image(table)
    bbox_rows = get_bbox_rows(line_image)
    if debug:
        fig, axes = plt.subplots(
            len(bbox_rows), len(bbox_rows[0]), figsize=(20, 6)
        )
    data = []
    # The context manager releases the Tesseract handle even if OCR raises;
    # the original created the API object and never closed it.
    with PyTessBaseAPI(
        path="/usr/share/tesseract-ocr/5/tessdata",
        oem=OEM.LSTM_ONLY,
        psm=PSM.SINGLE_LINE,
    ) as api:
        for ri, row in enumerate(bbox_rows):
            data_row = []
            for ci, col in enumerate(row):
                bbox = table[col[1] : col[1] + col[3], col[0] : col[0] + col[2]]
                text = parse_bbox(api, bbox)
                data_row.append(text)
                if debug:
                    ax = axes[ri, ci]
                    ax.set_axis_off()
                    ax.set_title(text, fontsize=13)
                    ax.imshow(bbox, cmap=plt.cm.gray)
            data.append(data_row)
    if debug:
        plt.tight_layout()
        plt.show()
    return data
# ``tables`` is ordered C, B, A (ascending contour area), hence the labels.
team_frames = []
for team, table in zip(["c", "b", "a"], tables):
    print("チーム", team.upper())
    data = parse_table_image(table, True)
    df = pd.DataFrame(data).drop(0, axis=1)
    team_frames.append(df)
    df.to_csv("data/pism/53-2-231-table-1-{}.csv".format(team), index=False)
    display(df)
# Concatenate once at the end instead of growing from an empty DataFrame on
# every iteration.
shift_table = pd.concat(team_frames, ignore_index=True)
チーム C
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20 | - | - | N | n | / | / | - | N | n | / | - | - | / | - | N | n | / | - | - | - | N | n | / | / | - | N | n | / | / | - | 9 | 11 | 5 | 0 |
| 1 | 21 | - | - | - | N | n | / | - | - | N | n | / | - | - | N | n | / | / | / | N | n | / | / | - | - | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 2 | 22 | - | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | - | - | / | / | - | 10 | 10 | 5 | 0 |
| 3 | 23 | N | n | / | / | - | - | + | / | - | N | n | / | - | - | / | N | n | / | - | / | - | - | / | N | n | / | - | - | N | n | 9 | 10 | 5 | 1 |
| 4 | 24 | - | - | N | n | / | - | + | - | / | / | N | n | / | / | - | - | - | / | - | - | / | - | N | n | / | - | - | N | n | / | 9 | 12 | 4 | 1 |
| 5 | 25 | n | / | / | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | / | - | N | n | / | / | - | - | N | 10 | 10 | 5 | 0 |
| 6 | 26 | / | / | - | - | N | n | + | / | - | - | N | n | / | - | - | / | / | - | - | N | n | / | - | - | N | n | / | / | - | - | 9 | 12 | 4 | 1 |
| 7 | 27 | n | / | / | / | - | - | N | n | / | - | - | + | - | / | - | - | N | n | / | - | + | / | N | n | / | - | - | N | n | / | 9 | 10 | 4 | 2 |
| 8 | 28 | / | / | - | - | / | - | - | - | / | / | - | N | n | / | / | - | - | - | N | n | / | - | / | - | - | - | / | - | - | N | 10 | 15 | 3 | 0 |
チーム B
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11 | / | - | - | N | n | / | / | - | N | n | / | + | - | N | n | / | - | - | - | N | n | / | / | / | - | / | / | - | N | n | 10 | 9 | 5 | 1 |
| 1 | 12 | N | n | / | / | - | - | - | / | - | N | n | / | / | - | - | N | n | / | / | / | / | - | - | - | N | n | / | - | - | - | 10 | 12 | 4 | 0 |
| 2 | 13 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | / | - | - | N | n | / | - | - | / | - | 10 | 10 | 5 | 0 |
| 3 | 14 | - | - | N | n | / | - | - | - | / | - | - | - | / | / | / | / | / | N | n | / | - | - | N | n | / | - | - | / | - | N | 10 | 13 | 4 | 0 |
| 4 | 15 | n | / | / | / | - | - | + | N | n | / | / | - | - | - | N | n | / | - | - | / | N | n | / | - | - | - | / | N | n | / | 10 | 10 | 4 | 1 |
| 5 | 16 | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | / | - | - | - | N | n | / | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 6 | 17 | - | / | - | - | N | n | / | - | / | / | - | + | N | n | / | - | - | - | N | n | / | - | - | - | / | N | n | / | / | / | 10 | 11 | 4 | 1 |
| 7 | 18 | / | - | - | / | / | N | n | / | - | - | / | - | - | + | - | N | n | / | - | - | N | n | / | / | - | N | n | / | - | / | 10 | 11 | 4 | 1 |
| 8 | 19 | - | / | / | - | / | - | - | - | / | / | / | + | - | + | - | - | - | / | - | - | - | / | - | / | - | - | - | / | - | - | 10 | 18 | 0 | 2 |
チーム A
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | n | / | / | - | N | n | / | - | / | - | - | + | N | n | / | - | - | N | n | / | - | - | N | n | / | - | / | N | n | / | 9 | 9 | 5 | 1 |
| 1 | 2 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | - | - | 9 | 11 | 5 | 0 |
| 2 | 3 | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | - | / | / | - | - | - | / | - | N | n | / | - | - | / | N | 9 | 12 | 5 | 0 |
| 3 | 4 | / | - | / | / | / | - | - | - | N | n | / | - | - | N | n | / | - | - | - | / | N | n | / | - | - | N | n | / | - | / | 10 | 12 | 4 | 0 |
| 4 | 5 | - | - | N | n | / | - | - | N | n | / | - | - | / | / | N | n | / | / | - | N | n | / | / | / | N | n | / | - | - | - | 10 | 10 | 5 | 0 |
| 5 | 6 | / | - | - | N | n | / | - | - | - | N | n | / | - | - | - | N | n | / | - | - | / | - | / | / | - | - | N | n | / | / | 9 | 13 | 4 | 0 |
| 6 | 7 | - | / | - | - | / | - | + | N | n | / | / | - | N | n | / | / | / | - | N | n | / | - | - | - | / | - | - | - | N | n | 9 | 12 | 4 | 1 |
| 7 | 8 | - | N | n | / | - | + | - | + | N | n | / | - | / | N | n | / | - | - | / | N | n | / | / | / | N | n | / | / | - | - | 10 | 8 | 5 | 2 |
| 8 | 9 | - | / | / | - | N | n | / | / | - | - | / | + | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | 10 | 11 | 4 | 1 |
| 9 | 10 | / | / | - | N | n | / | / | - | - | N | n | / | - | - | - | / | / | N | n | / | / | - | - | - | / | - | - | - | N | n | 10 | 12 | 4 | 0 |
結果を確認する
# Column 1 and columns 32-35 are converted to int; column 1 is then used as
# the sort key so the rows of all teams end up in one ascending sequence.
numeric_columns = [1, 32, 33, 34, 35]
shift_table[numeric_columns] = shift_table[numeric_columns].astype(int)
shift_table = shift_table.sort_values(by=1, ignore_index=True)
shift_table
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | n | / | / | - | N | n | / | - | / | - | - | + | N | n | / | - | - | N | n | / | - | - | N | n | / | - | / | N | n | / | 9 | 9 | 5 | 1 |
| 1 | 2 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | - | - | 9 | 11 | 5 | 0 |
| 2 | 3 | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | - | / | / | - | - | - | / | - | N | n | / | - | - | / | N | 9 | 12 | 5 | 0 |
| 3 | 4 | / | - | / | / | / | - | - | - | N | n | / | - | - | N | n | / | - | - | - | / | N | n | / | - | - | N | n | / | - | / | 10 | 12 | 4 | 0 |
| 4 | 5 | - | - | N | n | / | - | - | N | n | / | - | - | / | / | N | n | / | / | - | N | n | / | / | / | N | n | / | - | - | - | 10 | 10 | 5 | 0 |
| 5 | 6 | / | - | - | N | n | / | - | - | - | N | n | / | - | - | - | N | n | / | - | - | / | - | / | / | - | - | N | n | / | / | 9 | 13 | 4 | 0 |
| 6 | 7 | - | / | - | - | / | - | + | N | n | / | / | - | N | n | / | / | / | - | N | n | / | - | - | - | / | - | - | - | N | n | 9 | 12 | 4 | 1 |
| 7 | 8 | - | N | n | / | - | + | - | + | N | n | / | - | / | N | n | / | - | - | / | N | n | / | / | / | N | n | / | / | - | - | 10 | 8 | 5 | 2 |
| 8 | 9 | - | / | / | - | N | n | / | / | - | - | / | + | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | 10 | 11 | 4 | 1 |
| 9 | 10 | / | / | - | N | n | / | / | - | - | N | n | / | - | - | - | / | / | N | n | / | / | - | - | - | / | - | - | - | N | n | 10 | 12 | 4 | 0 |
| 10 | 11 | / | - | - | N | n | / | / | - | N | n | / | + | - | N | n | / | - | - | - | N | n | / | / | / | - | / | / | - | N | n | 10 | 9 | 5 | 1 |
| 11 | 12 | N | n | / | / | - | - | - | / | - | N | n | / | / | - | - | N | n | / | / | / | / | - | - | - | N | n | / | - | - | - | 10 | 12 | 4 | 0 |
| 12 | 13 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | / | - | - | N | n | / | - | - | / | - | 10 | 10 | 5 | 0 |
| 13 | 14 | - | - | N | n | / | - | - | - | / | - | - | - | / | / | / | / | / | N | n | / | - | - | N | n | / | - | - | / | - | N | 10 | 13 | 4 | 0 |
| 14 | 15 | n | / | / | / | - | - | + | N | n | / | / | - | - | - | N | n | / | - | - | / | N | n | / | - | - | - | / | N | n | / | 10 | 10 | 4 | 1 |
| 15 | 16 | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | / | - | - | - | N | n | / | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 16 | 17 | - | / | - | - | N | n | / | - | / | / | - | + | N | n | / | - | - | - | N | n | / | - | - | - | / | N | n | / | / | / | 10 | 11 | 4 | 1 |
| 17 | 18 | / | - | - | / | / | N | n | / | - | - | / | - | - | + | - | N | n | / | - | - | N | n | / | / | - | N | n | / | - | / | 10 | 11 | 4 | 1 |
| 18 | 19 | - | / | / | - | / | - | - | - | / | / | / | + | - | + | - | - | - | / | - | - | - | / | - | / | - | - | - | / | - | - | 10 | 18 | 0 | 2 |
| 19 | 20 | - | - | N | n | / | / | - | N | n | / | - | - | / | - | N | n | / | - | - | - | N | n | / | / | - | N | n | / | / | - | 9 | 11 | 5 | 0 |
| 20 | 21 | - | - | - | N | n | / | - | - | N | n | / | - | - | N | n | / | / | / | N | n | / | / | - | - | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 21 | 22 | - | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | - | - | / | / | - | 10 | 10 | 5 | 0 |
| 22 | 23 | N | n | / | / | - | - | + | / | - | N | n | / | - | - | / | N | n | / | - | / | - | - | / | N | n | / | - | - | N | n | 9 | 10 | 5 | 1 |
| 23 | 24 | - | - | N | n | / | - | + | - | / | / | N | n | / | / | - | - | - | / | - | - | / | - | N | n | / | - | - | N | n | / | 9 | 12 | 4 | 1 |
| 24 | 25 | n | / | / | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | / | - | N | n | / | / | - | - | N | 10 | 10 | 5 | 0 |
| 25 | 26 | / | / | - | - | N | n | + | / | - | - | N | n | / | - | - | / | / | - | - | N | n | / | - | - | N | n | / | / | - | - | 9 | 12 | 4 | 1 |
| 26 | 27 | n | / | / | / | - | - | N | n | / | - | - | + | - | / | - | - | N | n | / | - | + | / | N | n | / | - | - | N | n | / | 9 | 10 | 4 | 2 |
| 27 | 28 | / | / | - | - | / | - | - | - | / | / | - | N | n | / | / | - | - | - | N | n | / | - | / | - | - | - | / | - | - | N | 10 | 15 | 3 | 0 |
# Load the reference table: skip the first line of the file and label the
# columns 1..35.
answer_columns = list(range(1, 36))
answer = pd.read_csv(
    "data/pism/53-2-231-table-1.csv",
    header=None,
    skiprows=1,
    names=answer_columns,
)
answer
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | n | / | / | - | N | n | / | - | / | - | - | + | N | n | / | - | - | N | n | / | - | - | N | n | / | - | / | N | n | / | 9 | 9 | 5 | 1 |
| 1 | 2 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | - | - | 9 | 11 | 5 | 0 |
| 2 | 3 | N | n | / | / | - | N | n | / | - | - | N | n | / | - | - | - | / | / | - | - | - | / | - | N | n | / | - | - | / | N | 9 | 12 | 5 | 0 |
| 3 | 4 | / | - | / | / | / | - | - | - | N | n | / | - | - | N | n | / | - | - | - | / | N | n | / | - | - | N | n | / | - | / | 10 | 12 | 4 | 0 |
| 4 | 5 | - | - | N | n | / | - | - | N | n | / | - | - | / | / | N | n | / | / | - | N | n | / | / | / | N | n | / | - | - | - | 10 | 10 | 5 | 0 |
| 5 | 6 | / | - | - | N | n | / | - | - | - | N | n | / | - | - | - | N | n | / | - | - | / | - | / | / | - | - | N | n | / | / | 9 | 13 | 4 | 0 |
| 6 | 7 | - | / | - | - | / | - | + | N | n | / | / | - | N | n | / | / | / | - | N | n | / | - | - | - | / | - | - | - | N | n | 9 | 12 | 4 | 1 |
| 7 | 8 | - | N | n | / | - | + | - | + | N | n | / | - | / | N | n | / | - | - | / | N | n | / | / | / | N | n | / | / | - | - | 10 | 8 | 5 | 2 |
| 8 | 9 | - | / | / | - | N | n | / | / | - | - | / | + | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | 10 | 11 | 4 | 1 |
| 9 | 10 | / | / | - | N | n | / | / | - | - | N | n | / | - | - | - | / | / | N | n | / | / | - | - | - | / | - | - | - | N | n | 10 | 12 | 4 | 0 |
| 10 | 11 | / | - | - | N | n | / | / | - | N | n | / | + | - | N | n | / | - | - | - | N | n | / | / | / | - | / | / | - | N | n | 10 | 9 | 5 | 1 |
| 11 | 12 | N | n | / | / | - | - | - | / | - | N | n | / | / | - | - | N | n | / | / | / | / | - | - | - | N | n | / | - | - | - | 10 | 12 | 4 | 0 |
| 12 | 13 | / | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | / | - | - | N | n | / | - | - | / | - | 10 | 10 | 5 | 0 |
| 13 | 14 | - | - | N | n | / | - | - | - | / | - | - | - | / | / | / | / | / | N | n | / | - | - | N | n | / | - | - | / | - | N | 10 | 13 | 4 | 0 |
| 14 | 15 | n | / | / | / | - | - | + | N | n | / | / | - | - | - | N | n | / | - | - | / | N | n | / | - | - | - | / | N | n | / | 10 | 10 | 4 | 1 |
| 15 | 16 | N | n | / | - | - | N | n | / | - | - | N | n | / | - | - | / | / | / | - | - | - | N | n | / | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 16 | 17 | - | / | - | - | N | n | / | - | / | / | - | + | N | n | / | - | - | - | N | n | / | - | - | - | / | N | n | / | / | / | 10 | 11 | 4 | 1 |
| 17 | 18 | / | - | - | / | / | N | n | / | - | - | / | - | - | + | - | N | n | / | - | - | N | n | / | / | - | N | n | / | - | / | 10 | 11 | 4 | 1 |
| 18 | 19 | - | / | / | - | / | - | - | - | / | / | / | + | - | + | - | - | - | / | - | - | - | / | - | / | - | - | - | / | - | - | 10 | 18 | 0 | 2 |
| 19 | 20 | - | - | N | n | / | / | - | N | n | / | - | - | / | - | N | n | / | - | - | - | N | n | / | / | - | N | n | / | / | - | 9 | 11 | 5 | 0 |
| 20 | 21 | - | - | - | N | n | / | - | - | N | n | / | - | - | N | n | / | / | / | N | n | / | / | - | - | / | - | N | n | / | / | 10 | 10 | 5 | 0 |
| 21 | 22 | - | N | n | / | - | - | N | n | / | / | - | N | n | / | / | - | N | n | / | / | - | N | n | / | - | - | - | / | / | - | 10 | 10 | 5 | 0 |
| 22 | 23 | N | n | / | / | - | - | + | / | - | N | n | / | - | - | / | N | n | / | - | / | - | - | / | N | n | / | - | - | N | n | 9 | 10 | 5 | 1 |
| 23 | 24 | - | - | N | n | / | - | + | - | / | / | N | n | / | / | - | - | - | / | - | - | / | - | N | n | / | - | - | N | n | / | 9 | 12 | 4 | 1 |
| 24 | 25 | n | / | / | - | - | N | n | / | - | - | / | - | N | n | / | / | - | N | n | / | - | / | - | N | n | / | / | - | - | N | 10 | 10 | 5 | 0 |
| 25 | 26 | / | / | - | - | N | n | + | / | - | - | N | n | / | - | - | / | / | - | - | N | n | / | - | - | N | n | / | / | - | - | 9 | 12 | 4 | 1 |
| 26 | 27 | n | / | / | / | - | - | N | n | / | - | - | + | - | / | - | - | N | n | / | - | + | / | N | n | / | - | - | N | n | / | 9 | 10 | 4 | 2 |
| 27 | 28 | / | / | - | - | / | - | - | - | / | / | - | N | n | / | / | - | - | - | N | n | / | - | / | - | - | - | / | - | - | N | 10 | 15 | 3 | 0 |
正しい勤務表とOCRした勤務表を比較する
# Diff the reference table against the OCR result.
answer.compare(shift_table)
文字のフィルタが自動的に作られると便利そうだなと思った。