commit 6281ff6c7a0ba3242fac6204692731ce720ce80c
Author: Inoue Yosuke <[email protected]>
Date: Sat, 4 Nov 2023 20:07:31 +0900
Initial commit
Diffstat:
A | LICENSE | | | 24 | ++++++++++++++++++++++++ |
A | README.md | | | 7 | +++++++ |
A | extractmsg.py | | | 67 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 98 insertions(+), 0 deletions(-)
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/README.md b/README.md
@@ -0,0 +1,7 @@
+# slackexport-rectify
+Slack のエクスポート機能を使ってエクスポートしたデータから、メッセージっぽい文章を抜き出して JSON に保存する
+
+## Usage
+```sh
+./extractmsg.py /path/to/users.json /path/to/channel/directory/*.json
+```
diff --git a/extractmsg.py b/extractmsg.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+# Usage: extractmsg.py ./path/to/users.json ./path/to/messages_1.json ./path/to/messages_2.json ...
+
+from datetime import datetime, timezone
+from html.parser import HTMLParser
+import json
+import sys
+
+class sanitizer(HTMLParser):
+    """Reduce an HTML fragment to plain text.
+
+    Each ``<br>`` start tag becomes a newline, text nodes that are not
+    whitespace-only are appended verbatim, and every other tag is ignored.
+    The result is accumulated in ``self.text``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.text = ""
+
+    def handle_starttag(self, tag, attrs):
+        """Emit a newline for ``<br>``; all other start tags are ignored."""
+        if tag == "br":
+            self.text += "\n"
+
+    def handle_data(self, d):
+        """Append text node *d* unless it consists only of whitespace."""
+        if d.strip():
+            self.text += d
+
+    def sanitize(self, str):
+        """Parse the HTML in *str* and return the extracted plain text.
+
+        Parser state and the text buffer are cleared first, so a single
+        instance can be reused without earlier output leaking into later
+        results.
+        """
+        # NOTE: the parameter keeps its original name (which shadows the
+        # builtin) so that existing keyword callers continue to work.
+        self.reset()
+        self.text = ""
+        self.feed(str)
+        self.close()
+        return self.text
+
+class users():
+    """Lookup table from user ids to display names.
+
+    ``lst`` is the list of user records loaded from ``users.json``. Each
+    record must carry an ``"id"``. The name is taken from
+    ``profile.display_name_normalized``, then
+    ``profile.real_name_normalized``, then the record's ``"name"``, and
+    finally the id itself, so an incomplete record never aborts the run.
+
+    Attributes:
+        d: dict mapping user id -> chosen name.
+        r: list of ``("<@ID>", "@name")`` pairs used to rewrite mentions.
+    """
+
+    def __init__(self, lst):
+        self.d = {}
+        self.r = []
+        for e in lst:
+            uid = e["id"]
+            # A record may lack a profile, or the profile may lack either
+            # of the normalized name fields.
+            profile = e.get("profile") or {}
+            name = (
+                profile.get("display_name_normalized")
+                or profile.get("real_name_normalized")
+                or e.get("name")
+                or uid
+            )
+            self.d[uid] = name
+            self.r.append((f"<@{uid}>", f"@{name}"))
+
+    def get(self, id):
+        """Return the name registered for *id*, or *id* itself if unknown."""
+        return self.d.get(id, id)
+
+    def replace(self, msg):
+        """Return *msg* with every known ``<@ID>`` mention replaced by ``@name``."""
+        for mention, label in self.r:
+            msg = msg.replace(mention, label)
+        return msg
+
+def parse_msg(users, obj):
+    """Convert one exported record into a ``{"dt", "text", "username"}`` dict.
+
+    Args:
+        users: object providing ``get(id)`` and ``replace(msg)`` (see the
+            ``users`` class).
+        obj: one decoded JSON record from a channel export file.
+
+    Returns:
+        ``None`` when the record is not a plain message (its ``"type"`` is
+        not ``"message"`` or it carries a ``"subtype"``); otherwise a dict
+        holding the ISO-8601 UTC timestamp, the message text with mentions
+        rewritten, and the author's name.
+
+    Raises:
+        KeyError: if a plain message lacks ``"ts"`` or ``"user"``.
+    """
+    if obj.get("type") != "message" or "subtype" in obj:
+        return None
+
+    # Text priority: first attachment's text, else the first file's HTML
+    # preview (only consulted when there are no attachments), else the
+    # message's own text.
+    text = obj.get("text") or ""
+    if atts := obj.get("attachments"):
+        if t := atts[0].get("text"):
+            text = t
+    elif fs := obj.get("files"):
+        if h := fs[0].get("preview"):
+            text = sanitizer().sanitize(h)
+
+    # "ts" is an epoch value; convert it directly in UTC. Building a naive
+    # local datetime and then stamping tzinfo=UTC on it would shift the
+    # result by the machine's UTC offset.
+    when = datetime.fromtimestamp(float(obj["ts"]), tz=timezone.utc)
+
+    return {
+        "dt": when.isoformat(),
+        "text": users.replace(text),
+        "username": users.get(obj["user"]),
+    }
+
+def main(argv):
+    """Extract messages from export files and print them as one JSON array.
+
+    Args:
+        argv: full argument vector; ``argv[1]`` is the path to
+            ``users.json`` and ``argv[2:]`` are channel message files.
+
+    Returns:
+        Process exit status: 0 on success, 2 when ``users.json`` is missing
+        from the command line.
+    """
+    if len(argv) < 2:
+        print(
+            "Usage: extractmsg.py ./path/to/users.json "
+            "./path/to/messages_1.json ./path/to/messages_2.json ...",
+            file=sys.stderr,
+        )
+        return 2
+
+    # Binary mode lets json.load detect the file encoding itself.
+    with open(argv[1], 'rb') as f:
+        usrs = users(json.load(f))
+
+    accum = []
+    for fn in argv[2:]:
+        with open(fn, 'rb') as f:
+            records = json.load(f)
+        parsed = (parse_msg(usrs, o) for o in records)
+        # parse_msg returns None for records that are not plain messages.
+        accum.extend(m for m in parsed if m is not None)
+
+    print(json.dumps(accum, ensure_ascii=False))
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))