From 96b95d07addd0ce9badfcdeeece9e25f612ce608 Mon Sep 17 00:00:00 2001 From: Arkadiy Shapkin Date: Wed, 19 Feb 2020 20:24:52 +0300 Subject: [PATCH] Support parsing Json export from Telegram Desktop Client --- README.md | 35 ++++++---- config.yml | 7 +- parse.py | 20 ++++-- parsers/{telegram.py => telegram_api.py} | 2 +- parsers/telegram_json.py | 85 ++++++++++++++++++++++++ utils.py | 2 +- 6 files changed, 130 insertions(+), 21 deletions(-) rename parsers/{telegram.py => telegram_api.py} (97%) mode change 100755 => 100644 create mode 100644 parsers/telegram_json.py diff --git a/README.md b/README.md index 863b5c4..f77c869 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,13 @@ Can also generate histograms and word clouds from the chat logs. ### Support Matrix -| Platform | Direct Chat | Group Chat | -|:------------------:|:-----------: |:----------:| -| Facebook Messenger | ✔ | ✘ | -| Google Hangouts | ✔ | ✘ | -| Telegram | ✔ | ✘ | -| WhatsApp | ✔ | ✔ | +| Platform | Direct Chat | Group Chat | +|:-------------------------:|:-----------:|:----------:| +| Facebook Messenger | ✔ | ✘ | +| Google Hangouts | ✔ | ✘ | +| Telegram (API) | ✔ | ✘ | +| Telegram (Desktop Client) | ✔ | ✔ | +| WhatsApp | ✔ | ✔ | ### Exported data @@ -76,9 +77,16 @@ Unfortunately, WhatsApp only lets you export your conversations **from your phon 4. Send chat to yourself eg via Email 5. Unpack the archive and add the individual .txt files to the folder `./raw_data/whatsapp/` -### Telegram +### Telegram (Desktop Client) -The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. This process is documented below. Exporting Telegram chat logs is very fast. +1. Open Telegram Desktop Client +2. Open Settings > Export Telegram data +3. Unpack the result.json file to the folder `./raw_data/telegram/` + +### Telegram (API) + +The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. 
+This process is documented below. Exporting Telegram chat logs is very fast. ## 2. Setup Chatistics @@ -102,18 +110,21 @@ python parse.py messenger # WhatsApp python parse.py whatsapp + +# Telegram (Desktop Client) +python parse.py telegram_json ``` -### Telegram +### Telegram (API) 1. Create your Telegram application to access chat logs ([instructions](https://core.telegram.org/api/obtaining_api_id)). You will need `api_id` and `api_hash` which we will now set as environment variables. 2. Run `cp secrets.sh.example secrets.sh` and fill in the values for the environment variables `TELEGRAM_API_ID`, `TELEGRAMP_API_HASH` and `TELEGRAM_PHONE` (your phone number including country code). 3. Run `source secrets.sh` -4. Execute the parser script using `python parse.py telegram` +4. Execute the parser script using `python parse.py telegram_api` The pickle files will now be ready for analysis in the `data` folder! -For more options use the `-h` argument on the parsers (e.g. `python parse.py telegram --help`). +For more options use the `-h` argument on the parsers (e.g. `python parse.py telegram_api --help`). ## 3. All done! 
Play with your data @@ -144,7 +155,7 @@ Among other options you can filter messages as needed (also see `python visualiz ``` --platforms {telegram,whatsapp,messenger,hangouts} - Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts']) + Use data only from certain platforms (default: ['telegram_api', 'telegram_json', 'whatsapp', 'messenger', 'hangouts']) --filter-conversation Limit by conversations with this person/group (default: []) --filter-sender diff --git a/config.yml b/config.yml index 2332bed..6e39b43 100644 --- a/config.yml +++ b/config.yml @@ -15,9 +15,12 @@ hangouts: messenger: DEFAULT_RAW_LOCATION: 'raw_data/messenger' OUTPUT_PICKLE_NAME: 'messenger.pkl' -telegram: +telegram_api: USER_DIALOG_MESSAGES_LIMIT: 100000 - OUTPUT_PICKLE_NAME: 'telegram.pkl' + OUTPUT_PICKLE_NAME: 'telegram_api.pkl' +telegram_json: + DEFAULT_RAW_LOCATION: 'raw_data/telegram/result.json' + OUTPUT_PICKLE_NAME: 'telegram_json.pkl' whatsapp: DEFAULT_RAW_LOCATION: 'raw_data/whatsapp' OUTPUT_PICKLE_NAME: 'whatsapp.pkl' diff --git a/parse.py b/parse.py index 45bc623..702a13c 100644 --- a/parse.py +++ b/parse.py @@ -8,7 +8,8 @@ python parse.py [] Available commands: - telegram Parse logs from telegram + telegram_api Parse logs from telegram (api) + telegram_json Parse logs from telegram (desktop client) hangouts Parse logs from hangouts messenger Parse logs from messenger whatsapp Parse logs from whatsapp @@ -41,15 +42,24 @@ def __init__(self): sys.exit(1) getattr(self, args.command)() - def telegram(self): - from parsers.telegram import main - parser = ArgParseDefault(description='Parse message logs from Telegram') + def telegram_api(self): + from parsers.telegram_api import main + parser = ArgParseDefault(description='Parse message logs from Telegram (API)') parser = add_common_parse_arguments(parser) - parser.add_argument('--max-dialog', dest='max_dialog', type=int, default=config['telegram']['USER_DIALOG_MESSAGES_LIMIT'], + 
parser.add_argument('--max-dialog', dest='max_dialog', type=int, default=config['telegram_api']['USER_DIALOG_MESSAGES_LIMIT'], help='Maximum number of messages to export per dialog') args = parser.parse_args(sys.argv[2:]) main(args.own_name, max_exported_messages=args.max, user_dialog_messages_limit=args.max_dialog) + def telegram_json(self): + from parsers.telegram_json import main + parser = ArgParseDefault(description='Parse message logs from Telegram (Desktop Client)') + parser = add_common_parse_arguments(parser) + parser.add_argument('-f', '--file-path', dest='file_path', default=config['telegram_json']['DEFAULT_RAW_LOCATION'], + help='Path to Telegram chat log file (json file)') + args = parser.parse_args(sys.argv[2:]) + main(args.own_name, args.file_path, args.max) + def hangouts(self): from parsers.hangouts import main parser = ArgParseDefault(description='Parse message logs from Google Hangouts') diff --git a/parsers/telegram.py b/parsers/telegram_api.py old mode 100755 new mode 100644 similarity index 97% rename from parsers/telegram.py rename to parsers/telegram_api.py index 5c23558..b885a83 --- a/parsers/telegram.py +++ b/parsers/telegram_api.py @@ -61,7 +61,7 @@ async def _main_loop(client): df['platform'] = 'telegram' log.info('Detecting languages...') df = detect_language(df) - export_dataframe(df, config['telegram']['OUTPUT_PICKLE_NAME']) + export_dataframe(df, config['telegram_api']['OUTPUT_PICKLE_NAME']) log.info('Done.') diff --git a/parsers/telegram_json.py b/parsers/telegram_json.py new file mode 100644 index 0000000..a45fd93 --- /dev/null +++ b/parsers/telegram_json.py @@ -0,0 +1,85 @@ +from parsers.config import config +from parsers.utils import export_dataframe, detect_language +from dateutil.parser import parse +import json +import pandas as pd +import logging +from collections import defaultdict +import os + +log = logging.getLogger(__name__) + + +def main(own_name, file_path, max_exported_messages): + global MAX_EXPORTED_MESSAGES + 
MAX_EXPORTED_MESSAGES = max_exported_messages + log.info('Parsing Google Hangouts data...') + if not os.path.isfile(file_path): + log.error(f'No input file under {file_path}') + exit(0) + archive = read_archive(file_path) + if own_name is None: + own_name = " ".join([archive["personal_information"]["first_name"], archive["personal_information"]["last_name"]]) + own_id = archive["personal_information"]["user_id"] + data = parse_messages(archive, own_id) + log.info('{:,} messages parsed.'.format(len(data))) + if len(data) < 1: + log.info('Nothing to save.') + exit(0) + log.info('Converting to DataFrame...') + df = pd.DataFrame(data, columns=config['ALL_COLUMNS']) + df['platform'] = 'telegram' + log.info('Detecting languages...') + df = detect_language(df) + export_dataframe(df, config['telegram_json']['OUTPUT_PICKLE_NAME']) + log.info('Done.') + + +def parse_messages(archive, own_id): + def json_to_text(data): + result = "" + for v in data: + if isinstance(v, dict): + result += v["text"] + else: + result += v + return result + + data = [] + log.info('Extracting messages...') + for chat in archive["chats"]["list"]: + chat_type = chat["type"] + if chat_type == "personal_chat" or chat_type == "private_group" or chat_type == "private_supergroup": + conversation_with_id = chat["id"] + conversation_with_name = chat["name"] + for message in chat["messages"]: + if message["type"] != "message": + continue + timestamp = parse(message["date"]).timestamp() + # skip text from forwarded messages + text = message["text"] if "forwarded_from" not in message else "" + if "sticker_emoji" in message: + text = message["sticker_emoji"] + if isinstance(text, list): + text = json_to_text(text) + sender_name = message["from"] + sender_id = message["from_id"] + if sender_name is None: + # unknown sender + log.error(f"No senderName could be found for senderId ({sender_id})") + + # saves the message + outgoing = sender_id == own_id + data += [[timestamp, conversation_with_id, 
conversation_with_name, sender_name, outgoing, text, '', '']] + + if len(data) >= MAX_EXPORTED_MESSAGES: + log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.') + return data + return data + + +def read_archive(file_path): + log.info(f'Reading archive file {file_path}...') + with open(file_path, encoding='utf-8') as f: + archive = json.loads(f.read()) + return archive diff --git a/utils.py b/utils.py index 50659a6..430d1fb 100644 --- a/utils.py +++ b/utils.py @@ -16,7 +16,7 @@ def __init__(self, **kwargs): def add_load_data_args(parser): """Adds common data loader arguments to arg parser""" - platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts'] + platforms = ['telegram_api', 'telegram_json', 'whatsapp', 'messenger', 'hangouts'] parser.add_argument('-p', '--platforms', default=platforms, choices=platforms, nargs='+', help='Use data only from certain platforms') parser.add_argument('--filter-conversation', dest='filter_conversation', nargs='+', default=[], help='Limit by conversations with this person/group') parser.add_argument('--filter-sender', dest='filter_sender', nargs='+', default=[], help='Limit by messages by this sender')