アンケート結果の分類分け

生成AIを使って、自由記述欄の回答を分類分けする。
import json
import re
import time
from typing import Dict, Any, List
import requests
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# Chat endpoint that call_ollama() posts to; this value targets localhost:11434.
OLLAMA_URL = "http://localhost:11434/api/chat"
MODEL = "llama3.1:8b"  # adjust to match your environment

# System prompt (in Japanese) sent with every request. It tells the model to act
# as an assistant that structures survey free-text answers and to reply with
# JSON only, containing `respondent_id` and a `units` array whose items carry
# unit_id, unit_text, label, category, sentiment (-1.0 to +1.0) and
# confidence (0.0 to 1.0).
SYSTEM_PROMPT = """あなたはアンケート自由記述を構造化する分析アシスタントです。
必ず次のJSONスキーマで出力してください。余計な文章は禁止です。
- respondent_id: 入力のIDをそのまま
- units: 意味的に独立した意見単位の配列
  - unit_id: 1からの連番
  - unit_text: 原文の表現をできるだけ保持（解釈して言い換えない）
  - label: 3〜10文字程度の中立的な話題名詞（良い/悪い等は禁止）
  - category: 5〜8カテゴリに収まる粒度（例: UI/操作性, 導入・設定, 性能, サポート, 価格, ドキュメント, 信頼性 など）
  - sentiment: -1.0〜+1.0（否定=マイナス、肯定=プラス、中立=0）
  - confidence: 0.0〜1.0（自信度）
出力はJSONのみ。"""

# User-message template; filled with str.format using the `rid` and `text` fields.
USER_TEMPLATE = """respondent_id: {rid}
text: {text}
"""

def call_ollama(messages: List[Dict[str, str]], temperature: float = 0.2) -> str:
    """Post a non-streaming chat request to OLLAMA_URL and return the reply text.

    Args:
        messages: Chat messages forwarded as the ``messages`` field of the
            request body.
        temperature: Sampling temperature forwarded under ``options``.

    Returns:
        The ``["message"]["content"]`` value of the decoded JSON response.

    Raises:
        requests.HTTPError: If ``raise_for_status()`` rejects the response.
    """
    request_body = {
        "model": MODEL,
        "messages": messages,
        "options": {"temperature": temperature},
        "stream": False,
    }
    # A 120-second timeout bounds how long a single request may block.
    response = requests.post(OLLAMA_URL, json=request_body, timeout=120)
    response.raise_for_status()
    reply = response.json()
    return reply["message"]["content"]

def extract_json(text: str) -> Dict[str, Any]:
    """Return the first JSON object embedded in *text*.

    Model replies sometimes wrap the JSON in extra characters (a preamble,
    Markdown code fences, a trailing remark). Each ``{`` is tried in order as
    the start of an object and the first one that decodes to a dict is
    returned; anything before or after it is ignored.

    Args:
        text: Raw reply text that should contain a JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no decodable JSON object is found in *text*.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # tolerates arbitrary trailing characters after it.
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find("{", start + 1)
    raise ValueError("JSONが見つかりません")

def analyze_one(rid: str, text: str) -> Dict[str, Any]:
    """Ask the model to structure one respondent's answer and parse the reply.

    Args:
        rid: Respondent identifier inserted into the user message.
        text: Free-text answer inserted into the user message.

    Returns:
        The JSON object that extract_json() decodes from the model reply.
    """
    user_prompt = USER_TEMPLATE.format(rid=rid, text=text)
    reply = call_ollama(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
    )
    return extract_json(reply)

# ---- Sample input (a CSV source is the expected real-world case) ----
# To load real data instead, read a CSV with columns respondent_id and text:
# df_in = pd.read_csv("responses.csv")
df_in = pd.DataFrame(
    [
        ("R001", "操作は直感的だが初期設定が難しい。マニュアルも欲しい。"),
        ("R002", "処理が遅い。UIは見やすい。"),
    ],
    columns=["respondent_id", "text"],
)

# Analyze every respondent and flatten the result to one row per opinion unit.
# A respondent whose reply cannot be obtained or parsed yields exactly one
# placeholder row with category "PARSE_ERROR", so no respondent is dropped.
records = []
respondent_rows = df_in[["respondent_id", "text"]].itertuples(index=False, name=None)
for rid, text in tqdm(respondent_rows, total=len(df_in)):
    try:
        obj = analyze_one(rid, text)
        # Build all rows before touching `records`: if a later unit is
        # malformed, no partial rows are left behind next to the error row.
        unit_rows = []
        for u in obj["units"]:
            row = {"respondent_id": rid, **u}
            # Always keep the input ID; the model's echo of it (top-level or
            # inside a unit) may be altered and would break joins to df_in.
            row["respondent_id"] = rid
            unit_rows.append(row)
    except Exception as e:
        # Deliberate best-effort: record the failure and continue with the
        # next respondent instead of aborting the whole run.
        records.append({
            "respondent_id": rid,
            "unit_id": None,
            "unit_text": None,
            "label": None,
            "category": "PARSE_ERROR",
            "sentiment": None,
            "confidence": 0.0,
            "error": str(e),
            "raw_text": text
        })
    else:
        records.extend(unit_rows)
    time.sleep(0.1)  # brief pause between consecutive calls (tune as needed)

df = pd.DataFrame(records)

# ---- Visualization 1: category frequency (bar chart) ----
# Rows tagged "PARSE_ERROR" are excluded; the remaining categories are counted
# and ordered from most to least frequent.
cat_counts = (
    df.loc[df["category"] != "PARSE_ERROR"]
    .groupby("category")
    .size()
    .sort_values(ascending=False)
)
plt.figure()
cat_counts.plot.bar()
plt.title("Category Frequency")
plt.xlabel("category")
plt.ylabel("count")
plt.tight_layout()
plt.show()

# ---- Visualization 2: mean sentiment per category (scatter: sentiment x frequency) ----
# Rows tagged "PARSE_ERROR" are excluded. Sentiment comes from model-generated
# JSON and may arrive as a string, which would make the column object-typed and
# break the mean aggregation; coerce it to numeric first. Values that cannot be
# converted become NaN and are ignored by mean().
scored = df[df["category"] != "PARSE_ERROR"].assign(
    sentiment=lambda d: pd.to_numeric(d["sentiment"], errors="coerce")
)
agg = scored.groupby("category").agg(
    count=("category", "size"),
    mean_sent=("sentiment", "mean"),
).reset_index()

plt.figure()
plt.scatter(agg["mean_sent"], agg["count"])
# Columns after reset_index() are (category, count, mean_sent), in that order.
for category, count, mean_sent in agg.itertuples(index=False, name=None):
    # A category with no usable sentiment has a NaN mean and no plotted point,
    # so it gets no label either.
    if pd.notna(mean_sent):
        plt.text(mean_sent, count, category)
plt.title("Sentiment vs Frequency (by category)")
plt.xlabel("mean sentiment")
plt.ylabel("count")
plt.tight_layout()
plt.show()

# Print the first rows of the per-unit result table as a quick sanity check.
print(df.head())