表抽出と構造化 - L0 カリキュラム

ストーリー

田中VPoE

文書処理で特に難しいのが「表（テーブル）」の抽出だ。罫線があるもの、罫線なしのもの、結合セルがあるものなど、パターンが多い

あなた

確かに、表は見た目は分かりやすいですが、データとして取り出すのは難しそうですね

田中VPoE

その通り。表の構造を正確に認識し、行・列・セルの関係を維持したままデータ化するのは、Document AIの中でも最もチャレンジングな領域の一つだ

あなた

請求書の明細表や、報告書の集計表を自動でExcelに変換できたら便利ですよね

田中VPoE

まさにそれが目標だ。表の検出から構造化、そしてCSV/Excel出力までの一連の処理を学ぼう

表抽出の課題

表のパターン

パターン	特徴	難易度
罫線あり	明確な罫線で区切られた表	低
罫線なし	スペースやタブで整列された表	中
結合セル	複数セルが結合されたヘッダー等	高
ネスト表	表の中に表がある	高
不規則表	行や列の幅が不均一	高
複数ページ表	ページを跨ぐ表	高

よくある問題

問題1: セルの誤認識
  「東京都 | 港区」が「東京都港区」に結合される

問題2: ヘッダーの誤認識
  複数行ヘッダーのどこまでがヘッダーか不明

問題3: 結合セルの展開
  結合されたセルをどう展開するか

問題4: 空セルの処理
  空欄を null にするか "" にするか

AWS Textractによる表抽出

import boto3
import pandas as pd

def extract_tables_textract(document_path: str) -> list[pd.DataFrame]:
    """Extract every table from a document with AWS Textract.

    The file at ``document_path`` is read as bytes and sent to Textract's
    ``analyze_document`` call with the ``TABLES`` feature enabled. Each block
    of type ``TABLE`` in the response is parsed with ``_parse_table`` and
    turned into one DataFrame.

    Args:
        document_path: Path of the document file to analyze.

    Returns:
        One DataFrame per TABLE block, in the order the blocks appear in the
        response.
    """
    textract = boto3.client("textract")

    with open(document_path, "rb") as source:
        payload = source.read()

    result = textract.analyze_document(
        Document={"Bytes": payload},
        FeatureTypes=["TABLES"],
    )

    all_blocks = result["Blocks"]
    # Id -> block index so that relationships can be resolved while parsing.
    by_id = {blk["Id"]: blk for blk in all_blocks}

    parsed_tables = (
        _parse_table(blk, by_id)
        for blk in all_blocks
        if blk["BlockType"] == "TABLE"
    )
    return [
        pd.DataFrame(parsed["rows"], columns=parsed["headers"])
        for parsed in parsed_tables
    ]


def _cell_text(cell: dict, blocks: dict) -> str:
    """Return the text of all WORD children of a CELL block, space-separated."""
    words = []
    for rel in cell.get("Relationships") or []:
        if rel["Type"] != "CHILD":
            continue
        for word_id in rel["Ids"]:
            word = blocks.get(word_id, {})
            if word.get("BlockType") == "WORD":
                words.append(word.get("Text", ""))
    return " ".join(words).strip()


def _parse_table(table_block: dict, blocks: dict) -> dict:
    """Parse a TABLE block into a header list and data rows.

    Args:
        table_block: The TABLE block whose CHILD relationships list cell ids.
        blocks: Mapping of block id to block, used to resolve those ids.

    Returns:
        ``{"headers": [...], "rows": [[...], ...]}``. Row 1 supplies the
        headers and rows 2..max become data rows. A missing header cell is
        named ``col_<column index>`` and a missing data cell is ``""``.
    """
    cells = {}

    for rel in table_block.get("Relationships") or []:
        if rel["Type"] != "CHILD":
            continue
        for cell_id in rel["Ids"]:
            # Skip ids that cannot be resolved instead of raising KeyError,
            # the same way unresolved word ids are skipped in _cell_text.
            cell = blocks.get(cell_id)
            if cell is None or cell.get("BlockType") != "CELL":
                continue
            position = (cell["RowIndex"], cell["ColumnIndex"])
            cells[position] = _cell_text(cell, blocks)

    # Row and column indexes are 1-based in the loops below.
    max_row = max((r for r, _ in cells), default=0)
    max_col = max((c for _, c in cells), default=0)
    columns = range(1, max_col + 1)

    headers = [cells.get((1, c), f"col_{c}") for c in columns]
    rows = [
        [cells.get((r, c), "") for c in columns]
        for r in range(2, max_row + 1)
    ]

    return {"headers": headers, "rows": rows}

VLMベースの表抽出

VLMの利点

比較	専用OCR/テーブル検出	VLM
罫線なし表	苦手	得意（文脈理解）
結合セル	要設定	自動認識
ヘッダー判別	ルール必要	自動判別
カスタマイズ	再学習必要	プロンプトで対応
コスト	低	高

VLMによる表抽出の実装

# Media types this function will send for an image; any other guess falls
# back to "image/jpeg".
_SUPPORTED_IMAGE_MEDIA_TYPES = ("image/jpeg", "image/png", "image/gif", "image/webp")


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present."""
    body = text.strip()
    if not body.startswith("```"):
        return body
    # Drop the opening fence line, which may carry a language tag ("```json").
    _, _, body = body.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def extract_table_with_vlm(
    image_path: str,
    output_format: str = "json"
) -> dict:
    """Extract the tables in an image with a vision-language model.

    Args:
        image_path: Path of the image file to analyze.
        output_format: One of ``"json"``, ``"csv"`` or ``"markdown"``; selects
            the output instructions appended to the prompt.

    Returns:
        For ``"json"``, the parsed JSON object returned by the model (the
        prompt asks for a top-level ``"tables"`` list). For the other
        formats, ``{"raw": <model text>}``.

    Raises:
        KeyError: If ``output_format`` is not one of the known formats.
        json.JSONDecodeError: If the ``"json"`` reply is not valid JSON even
            after removing a surrounding code fence.
    """
    # Function-scope imports keep this snippet runnable on its own.
    import base64
    import json
    import mimetypes

    import anthropic

    client = anthropic.Anthropic()

    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode()

    # Declare the media type that matches the file instead of always claiming
    # JPEG; fall back to the previous default when the type is unknown.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type in _SUPPORTED_IMAGE_MEDIA_TYPES:
        media_type = guessed_type
    else:
        media_type = "image/jpeg"

    format_instructions = {
        "json": """JSON形式で出力:
{
  "tables": [
    {
      "title": "表のタイトル（あれば）",
      "headers": ["列ヘッダー1", "列ヘッダー2", ...],
      "rows": [
        ["セル1-1", "セル1-2", ...],
        ["セル2-1", "セル2-2", ...]
      ],
      "notes": "表に関する注記"
    }
  ]
}""",
        "csv": "CSV形式で出力してください。ヘッダー行を含め、カンマ区切りで出力。",
        "markdown": "Markdownのテーブル形式で出力してください。"
    }

    prompt = f"""この画像に含まれる全ての表を抽出してください。

注意事項：
- 結合セルは適切に展開してください
- 空のセルは空文字列("")として記録してください
- 数値は単位を分離してください（例: "1,000円" → "1000" + 単位:"円"）
- 表が複数ある場合は全て抽出してください

{format_instructions[output_format]}"""

    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": prompt}
            ]
        }]
    )

    reply_text = response.content[0].text
    if output_format == "json":
        # The reply may wrap the JSON in a ```json fence; remove it first.
        return json.loads(_strip_code_fence(reply_text))
    return {"raw": reply_text}

構造化とデータ変換

DataFrameへの変換

def tables_to_dataframes(extraction_result: dict) -> list[pd.DataFrame]:
    """Convert an extraction result into pandas DataFrames.

    Args:
        extraction_result: Dict with an optional ``"tables"`` list. Each table
            is a dict that may carry ``"headers"`` (list of column names) and
            ``"rows"`` (list of row lists).

    Returns:
        One DataFrame per table, with every column passed through
        ``_convert_column_type``. A table without headers or rows becomes an
        empty DataFrame.
    """
    dataframes = []

    for table in extraction_result.get("tables") or []:
        # Copy the headers so the caller's dict is not modified below.
        headers = list(table.get("headers") or [])
        rows = table.get("rows") or []

        # pandas raises ValueError when a row has more cells than there are
        # column names, so name the extra columns "col_<n>" (1-based).
        widest_row = max((len(row) for row in rows), default=0)
        headers.extend(
            f"col_{n}" for n in range(len(headers) + 1, widest_row + 1)
        )

        df = pd.DataFrame(rows, columns=headers)

        # Infer and apply a data type per column.
        for col in df.columns:
            df[col] = _convert_column_type(df[col])

        dataframes.append(df)

    return dataframes


def _convert_column_type(series: pd.Series) -> pd.Series:
    """Infer a better dtype for a column of extracted cell values.

    A column is converted to numbers when more than half of its values parse
    as numbers (commas are removed from strings first). Otherwise it is
    converted to datetimes when more than half of its string values parse as
    dates. Values that do not parse become NaN / NaT in a converted column.
    In every other case the column is returned unchanged.

    Args:
        series: Column to inspect.

    Returns:
        The converted column, or ``series`` itself when no conversion applies.
    """
    if series.empty:
        return series

    # Only object/string columns need inference. Passing an already numeric
    # column on to pd.to_datetime would turn its integers into timestamps.
    is_textual = (
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )
    if not is_textual:
        return series

    threshold = len(series) * 0.5

    # Try numbers: drop commas from strings, leave other values untouched so
    # real numbers in a mixed column are still counted.
    without_commas = series.map(
        lambda value: value.replace(",", "") if isinstance(value, str) else value
    )
    try:
        numeric = pd.to_numeric(without_commas, errors="coerce")
    except (ValueError, TypeError):
        numeric = None
    if numeric is not None and numeric.notna().sum() > threshold:
        return numeric

    # Try dates: only string values are candidates.
    text_only = series.where(series.map(lambda value: isinstance(value, str)))
    try:
        dates = pd.to_datetime(text_only, errors="coerce")
    except (ValueError, TypeError):
        return series
    if dates.notna().sum() > threshold:
        return dates

    return series

Excel出力

def _excel_column_letter(index: int) -> str:
    """Return the Excel column letters for a 1-based index (1 -> A, 27 -> AA)."""
    letters = ""
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters = chr(65 + remainder) + letters
    return letters


def export_to_excel(
    dataframes: list[pd.DataFrame],
    output_path: str,
    sheet_names: list[str] = None
) -> str:
    """Write a list of DataFrames to one Excel file, one sheet per DataFrame.

    Args:
        dataframes: DataFrames to write.
        output_path: Path of the Excel file to create.
        sheet_names: Optional sheet names matched to ``dataframes`` by
            position. A DataFrame without a matching name gets
            ``Table_<n>`` (1-based).

    Returns:
        ``output_path``.
    """
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for i, df in enumerate(dataframes):
            sheet_name = sheet_names[i] if sheet_names and i < len(sheet_names) else f"Table_{i+1}"
            df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Fit each column to its longest value, capped at 50.
            worksheet = writer.sheets[sheet_name]
            for col_idx, column in enumerate(df.columns, 1):
                # Select by position: df[column] returns a DataFrame when two
                # columns share a name.
                cell_lengths = df.iloc[:, col_idx - 1].astype(str).map(len)
                longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
                max_length = max(len(str(column)), longest_cell)
                # chr(64 + n) is only valid for A-Z; the helper also covers
                # column 27 onward (AA, AB, ...).
                worksheet.column_dimensions[
                    _excel_column_letter(col_idx)
                ].width = min(max_length + 2, 50)

    return output_path

複数ページ表の処理

def process_multipage_table(
    page_images: list[str],
    is_continuation: bool = True
) -> pd.DataFrame:
    """Extract one table that may span several page images.

    Args:
        page_images: Page image paths in reading order.
        is_continuation: When True, the rows of the first table on each later
            page are appended to the first table found. When False, only the
            first page is used.

    Returns:
        The combined table as a DataFrame, or an empty DataFrame when no
        table was found.
    """
    # With is_continuation=False only the first page's first table is ever
    # returned, so the later pages are not sent to the model at all.
    pages = page_images if is_continuation else page_images[:1]

    merged = None
    for image_path in pages:
        tables = extract_table_with_vlm(image_path).get("tables") or []
        if not tables:
            continue

        first_table = tables[0]
        page_rows = first_table.get("rows") or []

        if merged is None:
            # The first page that has a table supplies the headers. Starting
            # from a header-less placeholder would make the DataFrame
            # conversion fail once rows are appended to it.
            merged = {**first_table, "rows": list(page_rows)}
        else:
            # Later pages: only their rows are added, their headers are not.
            merged["rows"].extend(page_rows)

    if merged is None:
        return pd.DataFrame()

    dfs = tables_to_dataframes({"tables": [merged]})
    return dfs[0] if dfs else pd.DataFrame()

まとめ

項目	内容
主な課題	罫線なし表、結合セル、複数ページ表
Textract	罫線あり表に強い、API呼び出しのみ
VLM	罫線なし・結合セルに強い、プロンプトで柔軟対応
構造化出力	JSON → DataFrame → Excel/CSV

チェックリスト

表抽出の主な課題（罫線なし、結合セル等）を理解した
AWS Textractでの表抽出コードを理解した
VLMベースの表抽出の実装パターンを把握した
抽出結果のデータ変換（DataFrame化、Excel出力）を設計できる
複数ページ表の統合処理アプローチを説明できる

推定所要時間: 30分