slackexport-rectify

Slack のエクスポートデータから、メッセージ本文をいい感じにとってくる
git clone https://git.kamikakushi.net/slackexport-rectify.git
Log | Files | Refs | README | LICENSE

commit 6281ff6c7a0ba3242fac6204692731ce720ce80c
Author: Inoue Yosuke <[email protected]>
Date:   Sat,  4 Nov 2023 20:07:31 +0900

Initial commit

Diffstat:
A LICENSE | 24 ++++++++++++++++++++++++
A README.md | 7 +++++++
A extractmsg.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 98 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/README.md b/README.md @@ -0,0 +1,7 @@ +# slackexport-rectify +Slack のエクスポート機能を使ってエクスポートしたデータから、メッセージっぽい文章を抜き出して JSON に保存する + +## Usage +```sh +./extractmsg.py /path/to/users.json /path/to/channel/directory/*.json +``` diff --git a/extractmsg.py b/extractmsg.py @@ -0,0 +1,67 @@ +#!/usr/bin/python +# Usage: extractmsg.py ./path/to/users.json ./path/to/messages_1.json ./path/to/messages_2.json ... 
from datetime import datetime, timezone
from html.parser import HTMLParser
import json
import sys


class sanitizer(HTMLParser):
    """Reduce an HTML fragment to its plain text.

    ``<br>`` tags become newlines, whitespace-only text nodes are dropped,
    and every other tag is ignored (only its text content is kept).
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Plain text accumulated so far by the handler callbacks.
        self.text = ""

    def handle_starttag(self, tag, attrs):
        """Translate a line-break tag into a newline; ignore other tags."""
        if tag == "br":
            self.text += "\n"

    def handle_data(self, d):
        """Append a text node unless it consists only of whitespace."""
        if d.strip():
            self.text += d

    def sanitize(self, str):
        """Parse the HTML string *str* and return the collected plain text."""
        self.feed(str)
        # close() flushes any text the parser is still buffering.
        self.close()
        return self.text


class users():
    """Lookup table from user ids to display names.

    Built from a list of user records, each carrying an ``"id"`` and a
    ``"profile"`` mapping (the contents of ``users.json`` per the README).
    """

    def __init__(self, lst):
        """Index every record in *lst* by id.

        The name is the profile's ``display_name_normalized`` if non-empty,
        otherwise ``real_name_normalized``; if neither is available the id
        itself is used so a mention never collapses to a bare ``@``.
        """
        self.d = {}  # user id -> name
        self.r = []  # ("<@ID>", "@name") pairs consumed by replace()
        for entry in lst:
            uid = entry["id"]
            profile = entry.get("profile") or {}
            name = (
                profile.get("display_name_normalized")
                or profile.get("real_name_normalized")
                or uid
            )
            self.d[uid] = name
            self.r.append(("<@{}>".format(uid), "@{}".format(name)))

    def get(self, id):
        """Return the name registered for *id*, or *id* itself if unknown."""
        return self.d.get(id, id)

    def replace(self, msg):
        """Return *msg* with every ``<@ID>`` mention rewritten as ``@name``."""
        for mention, readable in self.r:
            msg = msg.replace(mention, readable)
        return msg


def parse_msg(users, obj):
    """Convert one exported message object into a compact record.

    Args:
        users: a ``users`` lookup used to resolve the author and mentions.
        obj: one decoded message dict from an export file.

    Returns:
        ``None`` when *obj* is not a plain message (its ``"type"`` is not
        ``"message"`` or it has a ``"subtype"``); otherwise a dict with
        ``"dt"`` (ISO 8601 timestamp in UTC), ``"text"`` and ``"username"``.

    Raises:
        KeyError: if *obj* lacks ``"type"``, ``"text"``, ``"ts"`` or
            ``"user"``.
    """
    if obj["type"] != "message" or "subtype" in obj:
        return None

    text = obj["text"]
    # The first attachment's text, or failing any attachments the first
    # file's HTML preview, takes precedence over the message's own text.
    if atts := obj.get("attachments"):
        if att_text := atts[0].get("text"):
            text = att_text
    elif files := obj.get("files"):
        if preview := files[0].get("preview"):
            text = sanitizer().sanitize(preview)

    # "ts" is an epoch timestamp. Convert it directly to an aware UTC
    # datetime: building a naive local-time value and then stamping UTC on
    # it would shift the result by the machine's UTC offset.
    stamp = datetime.fromtimestamp(float(obj["ts"]), tz=timezone.utc)

    return {
        "dt": stamp.isoformat(),
        "text": users.replace(text),
        "username": users.get(obj["user"]),
    }


def main(argv):
    """Extract messages from the files named in *argv* and print them as JSON.

    ``argv[1]`` is the users file; every following argument is a message
    file. Returns the process exit status.
    """
    if len(argv) < 2:
        print(
            "Usage: {} /path/to/users.json [/path/to/messages.json ...]".format(
                argv[0] if argv else "extractmsg.py"
            ),
            file=sys.stderr,
        )
        return 2

    with open(argv[1], 'rb') as f:
        usrs = users(json.load(f))

    accum = []
    for fn in argv[2:]:
        with open(fn, 'rb') as f:
            for raw in json.load(f):
                parsed = parse_msg(usrs, raw)
                if parsed is not None:
                    accum.append(parsed)
    print(json.dumps(accum, ensure_ascii=False))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))