アンケート結果の分類分け

生成AIで記述欄の分類分けをする。

import json
import re
import time
from typing import Dict, Any, List
import requests
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# URL that chat requests are POSTed to (a server on localhost, port 11434).
OLLAMA_URL = "http://localhost:11434/api/chat"
# Model tag placed in the "model" field of every request payload.
MODEL = "llama3.1:8b"  # adjust to match your environment

# System message sent with every request. It tells the model to answer with
# JSON only: the input `respondent_id` plus a `units` array whose items carry
# unit_id, unit_text, label, category, sentiment (-1.0 to +1.0) and
# confidence (0.0 to 1.0). The text is runtime data; edit it with care.
SYSTEM_PROMPT = """あなたはアンケート自由記述を構造化する分析アシスタントです。
必ず次のJSONスキーマで出力してください。余計な文章は禁止です。
- respondent_id: 入力のIDをそのまま
- units: 意味的に独立した意見単位の配列
  - unit_id: 1からの連番
  - unit_text: 原文の表現をできるだけ保持（解釈して言い換えない）
  - label: 3〜10文字程度の中立的な話題名詞（良い/悪い等は禁止）
  - category: 5〜8カテゴリに収まる粒度（例: UI/操作性, 導入・設定, 性能, サポート, 価格, ドキュメント, 信頼性 など）
  - sentiment: -1.0〜+1.0（否定=マイナス、肯定=プラス、中立=0）
  - confidence: 0.0〜1.0（自信度）
出力はJSONのみ。"""

# User message template; the `rid` and `text` placeholders are filled in
# with str.format.
USER_TEMPLATE = """respondent_id: {rid}
text: {text}
"""

def call_ollama(messages: List[Dict[str, str]], temperature: float = 0.2) -> str:
    """Send one non-streaming chat request and return the reply text.

    Args:
        messages: Chat messages to send, each a dict with string keys and
            values.
        temperature: Sampling temperature forwarded under ``options``.

    Returns:
        The ``["message"]["content"]`` value of the JSON response body.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    request_body = dict(
        model=MODEL,
        messages=messages,
        options=dict(temperature=temperature),
        stream=False,
    )
    response = requests.post(OLLAMA_URL, json=request_body, timeout=120)
    response.raise_for_status()
    reply = response.json()
    return reply["message"]["content"]

def extract_json(text: str) -> Dict[str, Any]:
    """Return the first JSON object that can be decoded from ``text``.

    Model replies often surround the JSON with extra material such as
    Markdown code fences, a leading sentence or a trailing remark. Each
    ``{`` in the text is tried in order as the start of a JSON document,
    and the first one that decodes successfully is returned; anything after
    the end of that object is ignored.

    Note that if the outermost object is malformed (for example truncated),
    a nested object that is itself valid may be returned instead.

    Args:
        text: Raw reply text that should contain a JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no JSON object can be decoded from ``text``. When a
            candidate was found but failed to decode, the last decoding
            error is attached as the cause.
    """
    decoder = json.JSONDecoder()
    last_error = None
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete document,
            # so trailing fences or prose do not matter.
            obj, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError as err:
            last_error = err
            start = text.find("{", start + 1)
        else:
            return obj
    raise ValueError("JSONが見つかりません") from last_error

def analyze_one(rid: str, text: str) -> Dict[str, Any]:
    """Have the model structure one free-text answer and parse its reply.

    Args:
        rid: Respondent ID inserted into the user message.
        text: Free-text answer inserted into the user message.

    Returns:
        The JSON object extracted from the model's reply.
    """
    user_content = USER_TEMPLATE.format(rid=rid, text=text)
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": user_content}
    return extract_json(call_ollama([system_msg, user_msg]))

# ---- Sample input (a CSV file is the expected real source) ----
# df_in = pd.read_csv("responses.csv")  # columns: respondent_id, text
df_in = pd.DataFrame(
    [
        ("R001", "操作は直感的だが初期設定が難しい。マニュアルも欲しい。"),
        ("R002", "処理が遅い。UIは見やすい。"),
    ],
    columns=["respondent_id", "text"],
)

# ---- Analyze every response and flatten the units into one row each ----
records = []
rows = df_in[["respondent_id", "text"]].itertuples(index=False, name=None)
for rid, text in tqdm(rows, total=len(df_in)):
    try:
        obj = analyze_one(rid, text)
        # Build this respondent's rows in a separate list first, so that a
        # malformed unit cannot leave half of them in `records` next to the
        # error row. The input `rid` is used as the ID instead of the value
        # echoed by the model, which may be missing or altered; any
        # `respondent_id` key inside a unit is dropped for the same reason.
        unit_rows = [
            {
                "respondent_id": rid,
                **{k: v for k, v in u.items() if k != "respondent_id"},
            }
            for u in obj["units"]
        ]
    except Exception as e:
        # Best-effort batch: record the failure (request error, unparsable
        # reply, unexpected structure) and continue with the next response.
        records.append({
            "respondent_id": rid,
            "unit_id": None,
            "unit_text": None,
            "label": None,
            "category": "PARSE_ERROR",
            "sentiment": None,
            "confidence": 0.0,
            "error": str(e),
            "raw_text": text,
        })
    else:
        records.extend(unit_rows)
    time.sleep(0.1)  # small pause between consecutive calls (tune as needed)

df = pd.DataFrame(records)

# ---- Prepare the successfully analyzed rows ----
if "category" in df.columns:
    valid = df[df["category"] != "PARSE_ERROR"].copy()
else:
    # No records at all (e.g. empty input): there is no "category" column.
    valid = pd.DataFrame(columns=["category", "sentiment"])
if "sentiment" not in valid.columns:
    valid["sentiment"] = float("nan")
# The model may return sentiment as a string or omit it; coerce to float so
# the mean is well defined (unparsable values become NaN).
valid["sentiment"] = pd.to_numeric(
    valid["sentiment"], errors="coerce"
).astype("float64")

cat_counts = valid.groupby("category").size().sort_values(ascending=False)

if cat_counts.empty:
    # Every response failed or no unit carried a category. Skip the plots
    # rather than plotting empty data, so the error rows printed below
    # remain visible.
    agg = pd.DataFrame(columns=["category", "count", "mean_sent"])
    print("No categorized units to plot; see the error rows below.")
else:
    # NOTE(review): category names come from the model and are likely
    # Japanese; matplotlib's default font may not render them. Confirm a
    # CJK-capable font is configured.

    # ---- Visualization 1: category frequency (bar chart) ----
    plt.figure()
    cat_counts.plot(kind="bar")
    plt.title("Category Frequency")
    plt.xlabel("category")
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

    # ---- Visualization 2: mean sentiment per category (scatter of
    # frequency vs sentiment) ----
    agg = valid.groupby("category").agg(
        count=("category", "size"),
        mean_sent=("sentiment", "mean"),
    ).reset_index()

    # Categories without any numeric sentiment have a NaN mean and cannot
    # be positioned on the scatter plot.
    plottable = agg.dropna(subset=["mean_sent"])

    plt.figure()
    plt.scatter(plottable["mean_sent"], plottable["count"])
    for x, y, name in zip(
        plottable["mean_sent"], plottable["count"], plottable["category"]
    ):
        plt.text(x, y, str(name))
    plt.title("Sentiment vs Frequency (by category)")
    plt.xlabel("mean sentiment")
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

print(df.head())

# NOTE(review): this section was a whitespace-collapsed copy of the script
# above (all newlines lost, so it was not valid Python). Its line structure
# has been restored using the formatted copy as the reference. It repeats
# the same definitions and top-level steps; confirm whether the duplicate
# should be kept at all.
import json
import re
import time
from typing import Dict, Any, List
import requests
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

# URL that chat requests are POSTed to (a server on localhost, port 11434).
OLLAMA_URL = "http://localhost:11434/api/chat"
# Model tag placed in the "model" field of every request payload.
MODEL = "llama3.1:8b"  # adjust to match your environment

# System message sent with every request: asks for JSON only, containing the
# input `respondent_id` and a `units` array (unit_id, unit_text, label,
# category, sentiment, confidence).
SYSTEM_PROMPT = """あなたはアンケート自由記述を構造化する分析アシスタントです。
必ず次のJSONスキーマで出力してください。余計な文章は禁止です。
- respondent_id: 入力のIDをそのまま
- units: 意味的に独立した意見単位の配列
  - unit_id: 1からの連番
  - unit_text: 原文の表現をできるだけ保持（解釈して言い換えない）
  - label: 3〜10文字程度の中立的な話題名詞（良い/悪い等は禁止）
  - category: 5〜8カテゴリに収まる粒度（例: UI/操作性, 導入・設定, 性能, サポート, 価格, ドキュメント, 信頼性 など）
  - sentiment: -1.0〜+1.0（否定=マイナス、肯定=プラス、中立=0）
  - confidence: 0.0〜1.0（自信度）
出力はJSONのみ。"""

# User message template; `rid` and `text` are filled in with str.format.
USER_TEMPLATE = """respondent_id: {rid}
text: {text}
"""


def call_ollama(messages: List[Dict[str, str]], temperature: float = 0.2) -> str:
    """Send one non-streaming chat request and return the reply text.

    Args:
        messages: Chat messages to send, each a dict with string keys and
            values.
        temperature: Sampling temperature forwarded under ``options``.

    Returns:
        The ``["message"]["content"]`` value of the JSON response body.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    payload = {
        "model": MODEL,
        "messages": messages,
        "options": {"temperature": temperature},
        "stream": False,
    }
    r = requests.post(OLLAMA_URL, json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["message"]["content"]


def extract_json(text: str) -> Dict[str, Any]:
    """Return the first JSON object that can be decoded from ``text``.

    Each ``{`` in the text is tried in order as the start of a JSON
    document, so surrounding code fences or prose are tolerated. If the
    outermost object is malformed, a valid nested object may be returned.

    Args:
        text: Raw reply text that should contain a JSON object.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no JSON object can be decoded from ``text``.
    """
    decoder = json.JSONDecoder()
    last_error = None
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete document.
            obj, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError as err:
            last_error = err
            start = text.find("{", start + 1)
        else:
            return obj
    raise ValueError("JSONが見つかりません") from last_error


def analyze_one(rid: str, text: str) -> Dict[str, Any]:
    """Have the model structure one free-text answer and parse its reply.

    Args:
        rid: Respondent ID inserted into the user message.
        text: Free-text answer inserted into the user message.

    Returns:
        The JSON object extracted from the model's reply.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_TEMPLATE.format(rid=rid, text=text)},
    ]
    out = call_ollama(messages)
    return extract_json(out)


# ---- Sample input (a CSV file is the expected real source) ----
# df_in = pd.read_csv("responses.csv")  # columns: respondent_id, text
df_in = pd.DataFrame({
    "respondent_id": ["R001", "R002"],
    "text": [
        "操作は直感的だが初期設定が難しい。マニュアルも欲しい。",
        "処理が遅い。UIは見やすい。",
    ],
})

# ---- Analyze every response and flatten the units into one row each ----
records = []
rows = df_in[["respondent_id", "text"]].itertuples(index=False, name=None)
for rid, text in tqdm(rows, total=len(df_in)):
    try:
        obj = analyze_one(rid, text)
        # Build the rows separately so a malformed unit cannot leave partial
        # rows behind; use the input `rid` rather than the model's echo.
        unit_rows = [
            {
                "respondent_id": rid,
                **{k: v for k, v in u.items() if k != "respondent_id"},
            }
            for u in obj["units"]
        ]
    except Exception as e:
        # Best-effort batch: record the failure and keep going.
        records.append({
            "respondent_id": rid,
            "unit_id": None,
            "unit_text": None,
            "label": None,
            "category": "PARSE_ERROR",
            "sentiment": None,
            "confidence": 0.0,
            "error": str(e),
            "raw_text": text,
        })
    else:
        records.extend(unit_rows)
    time.sleep(0.1)  # small pause between consecutive calls (tune as needed)

df = pd.DataFrame(records)

# ---- Prepare the successfully analyzed rows ----
if "category" in df.columns:
    valid = df[df["category"] != "PARSE_ERROR"].copy()
else:
    # No records at all (e.g. empty input): there is no "category" column.
    valid = pd.DataFrame(columns=["category", "sentiment"])
if "sentiment" not in valid.columns:
    valid["sentiment"] = float("nan")
# Coerce sentiment to float so the mean is well defined.
valid["sentiment"] = pd.to_numeric(
    valid["sentiment"], errors="coerce"
).astype("float64")

cat_counts = valid.groupby("category").size().sort_values(ascending=False)

if cat_counts.empty:
    # Nothing to plot; skip the charts so the error rows below stay visible.
    agg = pd.DataFrame(columns=["category", "count", "mean_sent"])
    print("No categorized units to plot; see the error rows below.")
else:
    # ---- Visualization 1: category frequency (bar chart) ----
    plt.figure()
    cat_counts.plot(kind="bar")
    plt.title("Category Frequency")
    plt.xlabel("category")
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

    # ---- Visualization 2: mean sentiment per category (scatter of
    # frequency vs sentiment) ----
    agg = valid.groupby("category").agg(
        count=("category", "size"),
        mean_sent=("sentiment", "mean"),
    ).reset_index()

    # Categories whose mean sentiment is NaN cannot be positioned.
    plottable = agg.dropna(subset=["mean_sent"])

    plt.figure()
    plt.scatter(plottable["mean_sent"], plottable["count"])
    for x, y, name in zip(
        plottable["mean_sent"], plottable["count"], plottable["category"]
    ):
        plt.text(x, y, str(name))
    plt.title("Sentiment vs Frequency (by category)")
    plt.xlabel("mean sentiment")
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

print(df.head())