Discord Message Extraction Bot
Build Guide
A guide to extracting every message from a Discord server and then identifying and pulling out the "kasu-uso" (junk-lie / fake-trivia style) messages.
Goal
Extract every message in the Discord server and save it to your PC
Identify and extract the kasu-uso-style messages from that data
Finally, produce a "kasu-uso-style message list (CSV)"
Create a working folder
Important: create the folder exactly at the path shown below
Folder layout
D:\discordbot
Steps
- Open the D: drive in Explorer
- Create a folder named discordbot
(the name can be anything, but this guide uses discordbot)
Install VS Code and open the folder
Steps
- Install VS Code
- In VS Code choose "Open Folder" → open D:\discordbot
- When asked "Do you trust the authors?" → choose Trust
Discord side: create the Bot and invite it
3-1. Create an application → create the Bot
- In the Discord Developer Portal (https://discord.com/developers/applications), click New Application
- Left menu Bot → Add Bot
3-2. Turn Message Content Intent ON
Required! If this is not ON, the bot cannot read message content.
- Bot page → Privileged Gateway Intents
- Turn Message Content Intent ON
- Click Save
3-3. Create the invite URL
- OAuth2 → URL Generator
- Scopes: check bot
- Bot Permissions: check View Channels and Read Message History
- Invite the bot to your server with the generated URL
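For reference, the generated URL should look roughly like the line below (YOUR_APPLICATION_ID is a placeholder, and 66560 is my assumption for the View Channels + Read Message History permission bitmask); just use whatever the generator actually outputs:
https://discord.com/oauth2/authorize?client_id=YOUR_APPLICATION_ID&scope=bot&permissions=66560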
Set things up in the VS Code terminal
VS Code top menu: Terminal → New Terminal
4-1. Move to the working folder
cd "D:\discordbot"
4-2. Install discord.py
pip install -U discord.py
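Optionally, you can confirm the install worked; this one-liner just imports the library and prints its version:
python -c "import discord; print(discord.__version__)"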
Extract all messages (JSONL output)
5-1. Create export_all_jsonl.py
Create a new file in VS Code: export_all_jsonl.py
Paste in the contents below and save (Ctrl+S)
Replace GUILD_ID = 0 with your own server ID (enable Developer Mode under User Settings → Advanced, then right-click your server name and choose "Copy Server ID")
import os
import json
import asyncio
from datetime import datetime, timezone
from typing import Dict, Any, Optional
import discord
# ===================== Settings =====================
GUILD_ID = 0 # ← replace with your own server ID (e.g. 1143898913993138209)
OUTPUT_DIR = "./exports"
CHECKPOINT_FILE = os.path.join(OUTPUT_DIR, "checkpoints.json")
FLUSH_EVERY = 200
CP_SAVE_EVERY = 500
YIELD_EVERY = 2000
SKIP_BOT_MESSAGES = False
# ===============================================
def utc_str(dt: datetime) -> str:
return dt.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
def load_checkpoints() -> Dict[str, Any]:
if not os.path.exists(CHECKPOINT_FILE):
return {}
with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
return json.load(f)
def save_checkpoints(cp: Dict[str, Any]) -> None:
tmp = CHECKPOINT_FILE + ".tmp"
with open(tmp, "w", encoding="utf-8") as f:
json.dump(cp, f, ensure_ascii=False, indent=2)
os.replace(tmp, CHECKPOINT_FILE)
def ensure_dir():
os.makedirs(OUTPUT_DIR, exist_ok=True)
intents = discord.Intents.default()
intents.guilds = True
intents.messages = True
intents.message_content = True
class ExportClient(discord.Client):
async def on_ready(self):
print(f"Logged in as: {self.user} (id={self.user.id})")
ensure_dir()
checkpoints = load_checkpoints()
guild = self.get_guild(GUILD_ID)
if guild is None:
guild = await self.fetch_guild(GUILD_ID)
channels = await guild.fetch_channels()
text_channels = [ch for ch in channels if isinstance(ch, discord.TextChannel)]
print(f"Found text channels: {len(text_channels)}")
total = 0
for ch in text_channels:
exported = await export_channel_jsonl(ch, checkpoints)
total += exported
save_checkpoints(checkpoints)
print(f"\nAll done. Total exported messages = {total}")
await self.close()
async def export_channel_jsonl(channel: discord.TextChannel, checkpoints: Dict[str, Any]) -> int:
ch_id = str(channel.id)
ch_name = channel.name
out_path = os.path.join(OUTPUT_DIR, f"channel_{ch_id}.jsonl")
last_ts_str: Optional[str] = None
if ch_id in checkpoints:
last_ts_str = checkpoints[ch_id].get("last_ts_utc")
after_dt: Optional[datetime] = None
if last_ts_str:
after_dt = datetime.strptime(last_ts_str, "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
print(f"\n[RESUME] #{ch_name} ({ch_id}) after {last_ts_str}")
else:
print(f"\n[START] #{ch_name} ({ch_id}) from beginning")
exported = 0
since_flush = 0
since_cp = 0
newest_dt: Optional[datetime] = None
with open(out_path, "a", encoding="utf-8") as f:
async for msg in channel.history(limit=None, oldest_first=True, after=after_dt):
created = msg.created_at.astimezone(timezone.utc)
newest_dt = created
record = {
"message_id": str(msg.id),
"date": utc_str(created),
"author": getattr(msg.author, "display_name", str(msg.author)),
"content": msg.content or "",
"channel_id": ch_id,
"channel_name": ch_name,
}
f.write(json.dumps(record, ensure_ascii=False) + "\n")
exported += 1
since_flush += 1
since_cp += 1
if since_flush >= FLUSH_EVERY:
f.flush()
since_flush = 0
if since_cp >= CP_SAVE_EVERY and newest_dt is not None:
checkpoints[ch_id] = {"last_ts_utc": utc_str(newest_dt)}
save_checkpoints(checkpoints)
since_cp = 0
print(f" ...{exported} msgs (checkpoint {checkpoints[ch_id]['last_ts_utc']})")
if exported % YIELD_EVERY == 0:
await asyncio.sleep(0.2)
if newest_dt is not None:
checkpoints[ch_id] = {"last_ts_utc": utc_str(newest_dt)}
print(f"[DONE] #{ch_name} ({ch_id}) exported={exported}")
return exported
def main():
token = os.getenv("DISCORD_BOT_TOKEN")
if not token:
raise RuntimeError("DISCORD_BOT_TOKEN is not set.")
if GUILD_ID == 0:
raise RuntimeError("Set GUILD_ID in the script (GUILD_ID=...).")
client = ExportClient(intents=intents)
client.run(token)
if __name__ == "__main__":
main()
5-2. Set the token as an environment variable and run
Caution: never share your token with anyone!
In the VS Code terminal (PowerShell), run:
$env:DISCORD_BOT_TOKEN="paste your bot token here"
python -u export_all_jsonl.py
channel_*.jsonl files and checkpoints.json will be created under exports/
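Each line of a channel_*.jsonl file is one message stored as a JSON object. An illustrative record (all values are placeholders) looks like this:
{"message_id": "111111111111111111", "date": "2024-01-01 12:34:56", "author": "someuser", "content": "hello", "channel_id": "222222222222222222", "channel_name": "general"}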
Build the full message list (CSV)
6-1. Create convert_jsonl_to_csv.py
Create a new file in VS Code: convert_jsonl_to_csv.py
import os
import glob
import json
import csv
from typing import Dict, Any, Iterable
INPUT_DIR = "./exports"
OUTPUT_DIR = "./exports_csv"
PER_CHANNEL_DIR = os.path.join(OUTPUT_DIR, "per_channel")
COLUMNS = ["message_id", "date", "author", "content", "channel_id", "channel_name"]
def iter_jsonl(path: str) -> Iterable[Dict[str, Any]]:
with open(path, "r", encoding="utf-8") as f:
for line_no, line in enumerate(f, start=1):
line = line.strip()
if not line:
continue
try:
yield json.loads(line)
except json.JSONDecodeError as e:
print(f"[WARN] JSON decode failed: {path}:{line_no} ({e})")
continue
def ensure_dirs():
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PER_CHANNEL_DIR, exist_ok=True)
def write_csv(path: str, rows: Iterable[Dict[str, Any]], write_header: bool = True) -> int:
count = 0
with open(path, "w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=COLUMNS, extrasaction="ignore")
if write_header:
writer.writeheader()
for r in rows:
out = {k: (r.get(k, "") if r.get(k, "") is not None else "") for k in COLUMNS}
writer.writerow(out)
count += 1
return count
def main():
ensure_dirs()
jsonl_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "channel_*.jsonl")))
if not jsonl_paths:
raise SystemExit(f"No jsonl files found in {INPUT_DIR}. (expected channel_*.jsonl)")
all_out = os.path.join(OUTPUT_DIR, "all_messages.csv")
def all_rows():
for p in jsonl_paths:
yield from iter_jsonl(p)
total = write_csv(all_out, all_rows())
print(f"[OK] Wrote combined CSV: {all_out} (rows={total})")
per_total = 0
for p in jsonl_paths:
base = os.path.basename(p)
out_path = os.path.join(PER_CHANNEL_DIR, base.replace(".jsonl", ".csv"))
n = write_csv(out_path, iter_jsonl(p))
per_total += n
print(f"[OK] Wrote per-channel CSVs: {PER_CHANNEL_DIR} (rows={per_total}, files={len(jsonl_paths)})")
if __name__ == "__main__":
main()
6-2. Run it
python -u convert_jsonl_to_csv.py
exports_csv/all_messages.csv (the full message list) will be created
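If you want a quick sanity check on the output, here is a minimal sketch using only the standard library (the path matches the file produced above):
import csv

with open("./exports_csv/all_messages.csv", encoding="utf-8", newline="") as f:
    rows = list(csv.DictReader(f))

print("rows:", len(rows))                    # total number of exported messages
print("first:", rows[0] if rows else None)   # peek at one record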
"カス嘘っぽい"メッセージだけ抽出する
ここが追加工程です。
やることは「カス嘘の本文を正解例として取り込み」→「似てる文体をスコアで判定」→「分別」です。
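The similarity part compares character n-grams (3-grams by default) with cosine similarity, the same idea filter_lie_style.py uses below. A minimal toy sketch (the example strings are made up for illustration):
import math
from collections import Counter

def char_ngrams(s: str, n: int = 3) -> Counter:
    # count overlapping character n-grams
    return Counter(s[i:i + n] for i in range(len(s) - n + 1))

def cosine(a: Counter, b: Counter) -> float:
    # cosine similarity between two n-gram count vectors
    dot = sum(v * b.get(k, 0) for k, v in a.items())
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na and nb else 0.0

ref = char_ngrams("実はトマトは野菜ではなく果物に分類されるとされている")  # trivia-style reference text
print(cosine(char_ngrams("実はバナナも果物ではなく草に分類されると言われている"), ref))  # similar style -> higher score
print(cosine(char_ngrams("今日の夕飯どうする?"), ref))  # casual chat -> near 0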
7-1. Install the required libraries
pip install -U requests beautifulsoup4
7-2. Create filter_lie_style.py
Create a new file in VS Code: filter_lie_style.py
import os, glob, json, re, math
from collections import Counter
from typing import List
import requests
from bs4 import BeautifulSoup
# ====== Settings ======
LIE_URL = "https://c-halberd.com/lie-special/"
INPUT_DIR = "./exports"
OUTPUT_DIR = "./filtered"
OUT_LIE = os.path.join(OUTPUT_DIR, "lie_style.jsonl")
OUT_OTHER = os.path.join(OUTPUT_DIR, "non_lie_style.jsonl")
# Threshold (raise it to be stricter / lower it to catch more)
THRESHOLD = 0.23
# Character n-grams hold up well for Japanese even without morphological analysis
NGRAM_N = 3
# Features (with weights) that boost the "trivia-like" score
FEATURE_PATTERNS = [
(re.compile(r"(実は|本来|由来|ルーツ|諸説|とされている|と言われている)"), 0.06),
(re.compile(r"(ため|ので|ことになっている|必要がある|慣習)"), 0.05),
(re.compile(r"(法律|規格|技術|制度|観測|発掘|中世|紀元前|明治|大正|戦後)"), 0.05),
(re.compile(r"(\d+(\.\d+)?)(km|m|cm|GB|%|%|時間|年|ヶ月|月|人)"), 0.05),
(re.compile(r"(という|とは|である|になる|なる)"), 0.03),
]
MIN_LEN = 6
# ==================
def ensure_dirs():
os.makedirs(OUTPUT_DIR, exist_ok=True)
def clean_text(s: str) -> str:
s = (s or "").strip()
s = re.sub(r"\s+", " ", s)
return s
def char_ngrams(s: str, n: int) -> Counter:
s = clean_text(s)
if len(s) < n:
return Counter()
grams = [s[i:i+n] for i in range(len(s)-n+1)]
return Counter(grams)
def cosine(a: Counter, b: Counter) -> float:
if not a or not b:
return 0.0
dot = 0
for k, va in a.items():
vb = b.get(k)
if vb:
dot += va * vb
na = math.sqrt(sum(v*v for v in a.values()))
nb = math.sqrt(sum(v*v for v in b.values()))
if na == 0 or nb == 0:
return 0.0
return dot / (na * nb)
def fetch_lie_items(url: str) -> List[str]:
html = requests.get(url, timeout=30).text
soup = BeautifulSoup(html, "html.parser")
items = []
for li in soup.find_all("li"):
t = clean_text(li.get_text(" ", strip=True))
if len(t) >= 12 and ("http" not in t):
items.append(t)
seen = set()
uniq = []
for x in items:
if x in seen:
continue
seen.add(x)
uniq.append(x)
return uniq
def lie_style_bonus(text: str) -> float:
bonus = 0.0
for pat, w in FEATURE_PATTERNS:
if pat.search(text):
bonus += w
return bonus
def max_similarity(msg: str, lie_vecs: List[Counter]) -> float:
v = char_ngrams(msg, NGRAM_N)
if not v:
return 0.0
best = 0.0
for lv in lie_vecs:
s = cosine(v, lv)
if s > best:
best = s
return best
def main():
ensure_dirs()
lie_items = fetch_lie_items(LIE_URL)
if not lie_items:
raise SystemExit("Failed to fetch lie items. URL or parsing may have changed.")
lie_vecs = [char_ngrams(x, NGRAM_N) for x in lie_items]
paths = sorted(glob.glob(os.path.join(INPUT_DIR, "channel_*.jsonl")))
if not paths:
raise SystemExit(f"No input jsonl files found in {INPUT_DIR}")
lie_count = 0
other_count = 0
with open(OUT_LIE, "w", encoding="utf-8") as f_lie, open(OUT_OTHER, "w", encoding="utf-8") as f_other:
for p in paths:
with open(p, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
try:
msg = json.loads(line)
except json.JSONDecodeError:
continue
content = clean_text(msg.get("content", ""))
if len(content) < MIN_LEN:
msg["_lie_style"] = False
msg["_lie_score"] = 0.0
other_count += 1
f_other.write(json.dumps(msg, ensure_ascii=False) + "\n")
continue
sim = max_similarity(content, lie_vecs)
bonus = lie_style_bonus(content)
score = sim + bonus
msg["_lie_style"] = score >= THRESHOLD
msg["_lie_score"] = round(score, 4)
msg["_lie_similarity"] = round(sim, 4)
msg["_lie_bonus"] = round(bonus, 4)
if msg["_lie_style"]:
lie_count += 1
f_lie.write(json.dumps(msg, ensure_ascii=False) + "\n")
else:
other_count += 1
f_other.write(json.dumps(msg, ensure_ascii=False) + "\n")
print(f"[OK] lie_style: {lie_count} -> {OUT_LIE}")
print(f"[OK] non_lie_style: {other_count} -> {OUT_OTHER}")
    print(f"[TIP] Raise THRESHOLD={THRESHOLD} to be stricter, lower it to catch more.")
if __name__ == "__main__":
main()
7-3. Run it
python -u filter_lie_style.py
filtered/lie_style.jsonl (the kasu-uso-style messages)
filtered/non_lie_style.jsonl (everything else)
Turn the kasu-uso-style messages into a message list (CSV)
Finally, convert filtered/lie_style.jsonl to CSV.
8-1. Create convert_one_jsonl_to_csv.py
Create a new file in VS Code: convert_one_jsonl_to_csv.py
import os
import json
import csv
INPUT_JSONL = "./filtered/lie_style.jsonl"
OUTPUT_CSV = "./filtered/lie_style_messages.csv"
COLUMNS = [
"message_id", "date", "author", "content", "channel_id", "channel_name",
"_lie_score", "_lie_similarity", "_lie_bonus"
]
def main():
if not os.path.exists(INPUT_JSONL):
raise SystemExit(f"Input not found: {INPUT_JSONL}")
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
rows = 0
with open(INPUT_JSONL, "r", encoding="utf-8") as f_in, open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f_out:
w = csv.DictWriter(f_out, fieldnames=COLUMNS, extrasaction="ignore")
w.writeheader()
for line in f_in:
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
w.writerow({k: obj.get(k, "") for k in COLUMNS})
rows += 1
print(f"[OK] Wrote CSV: {OUTPUT_CSV} (rows={rows})")
if __name__ == "__main__":
main()
8-2. Run it
python -u convert_one_jsonl_to_csv.py
filtered/lie_style_messages.csv (the list of kasu-uso-style messages) will be created
When it misses too much / catches too much
Adjust only THRESHOLD (0.23 by default) in filter_lie_style.py: raise it to be stricter, lower it to catch more. The sketch below can help you pick a value.
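A minimal sketch for choosing a threshold, assuming filter_lie_style.py has already been run once so both filtered/*.jsonl files exist and carry a _lie_score field:
import json

scores = []
for path in ("./filtered/lie_style.jsonl", "./filtered/non_lie_style.jsonl"):
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                scores.append(json.loads(line).get("_lie_score", 0.0))

# Look at the highest scores and set THRESHOLD just below where
# the genuinely kasu-uso-looking messages stop appearing.
for s in sorted(scores, reverse=True)[:30]:
    print(s)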
Summary of outputs (what ends up where)
📁 Full-extraction JSONL
exports/channel_*.jsonl
📁 Full message list (CSV)
exports_csv/all_messages.csv
📁 JSONL after kasu-uso classification
filtered/lie_style.jsonl
filtered/non_lie_style.jsonl
📁 Kasu-uso-style message list (CSV)
filtered/lie_style_messages.csv