diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..95476f3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +sms/ +gvoice-all.xml diff --git a/README.md b/README.md index 98c0673..d37c868 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,24 @@ # gvoice-sms-takeout-xml -Convert Google Voice SMS data from Takeout to .xml suitable for use with SMS Backup and Restore +Convert Google Voice SMS data from Takeout to .xml suitable for use with SMS Backup and Restore. +Input data is a folder of SMS .html files from Google Takeout. + +Working as of 2020-04-22. + +## How to use: +1. Go to https://contacts.google.com +2. Export all Google Contacts +3. Delete all Google Contacts (this is required so that numbers show up for each thread, otherwise Takeout will sometimes only have names. If you want to skip this step, you can, but some messages won't be linked to the right thread if you do. Note that this may remove Contact Photos on iOS if you don't pause syncing on your iOS device) +4. Request a Google Voice export from Google Takeout and download it +5. Restore contacts to your account +6. Download this script to your computer +7. Extract Google Voice Takeout and move the folder into the same folder as this script +8. Open terminal +9. Install pip (sudo easy_install pip) +10. sudo pip install virtualenv +11. virtualenv sms && source sms/bin/activate +12. pip install phonenumbers BeautifulSoup4 python-dateutil +13. python sms.py +14. Copy the file "gvoice-all.xml" to your phone, then restore from it using SMS Backup and Restore + -This is a personal project from a few years back when Google switched Voice to Hangouts and I wanted to grab my old messages and get them into a usable format. It worked at the time; I don't know if it works as-is, but I'm planning on some testing in the near term to get it functional. -Input data is a folder of SMS .html files from Google Takeout. 
diff --git a/sms.py b/sms.py index 1809586..98e54d4 100644 --- a/sms.py +++ b/sms.py @@ -1,35 +1,24 @@ from bs4 import BeautifulSoup +import re import os import phonenumbers +import dateutil.parser import time, datetime from calendar import timegm import warnings - -import tkinter as tk -from tkinter import filedialog - -root = tk.Tk() -root.withdraw() - -file_path = filedialog.askopenfilename() - -""" -TO DO: -filter emoji -fix double-double quotes -""" +from io import open # adds emoji support sms_backup_filename = "./gvoice-all.xml" +print('New file will be saved to ' + sms_backup_filename) def main(): - + print('Checking directory for *.html files') num_sms = 0 root_dir = '.' for subdir, dirs, files in os.walk(root_dir): for file in files: sms_filename = os.path.join(subdir, file) - #print(sms_filename) try: sms_file = open(sms_filename, 'r') @@ -37,9 +26,12 @@ def main(): continue if(os.path.splitext(sms_filename)[1] != '.html'): - print(sms_filename,"- skipped") + # print(sms_filename,"- skipped") continue - print(sms_filename) + + print('Processing ' + sms_filename) + + is_group_conversation = re.match(r'(^Group Conversation)', file) soup = BeautifulSoup(sms_file, 'html.parser') @@ -47,32 +39,78 @@ def main(): num_sms += len(messages_raw) - sms_values = {'phone' : get_phone(messages_raw)} - - for i in range(len(messages_raw)): - ## print('Unix time:',get_time_unix(messages_raw[i])) - ## print('Sender:',get_phone(messages_raw[i])) - ## print('Type:',get_message_type(messages_raw[i])) - ## print('Message text:',get_message_text(messages_raw[i])) - ## print('-----') - sms_values['type'] = get_message_type(messages_raw[i]) - sms_values['message'] = get_message_text(messages_raw[i]) - sms_values['time'] = get_time_unix(messages_raw[i]) - sms_text = (' \n' % sms_values) - sms_backup_file = open(sms_backup_filename, 'a') - sms_backup_file.write(sms_text) - sms_backup_file.close() + if is_group_conversation: + participants_raw = soup.find_all(class_='participants') 
+ write_mms_messages(participants_raw, messages_raw) + else: + write_sms_messages(file, messages_raw) + sms_backup_file = open(sms_backup_filename, 'a') - sms_backup_file.write('') + sms_backup_file.write(u'') sms_backup_file.close() write_header(sms_backup_filename, num_sms) +def write_sms_messages(file, messages_raw): + fallback_number = 0 + title_has_number = re.search(r"(^\+*[0-9]+)", file) + if title_has_number: + fallback_number = title_has_number.group() + + sms_values = {'phone' : get_first_phone_number(messages_raw, fallback_number)} + + sms_backup_file = open(sms_backup_filename, 'a') + for i in range(len(messages_raw)): + sms_values['type'] = get_message_type(messages_raw[i]) + sms_values['message'] = get_message_text(messages_raw[i]) + sms_values['time'] = get_time_unix(messages_raw[i]) + sms_text = (' \n' % sms_values) + sms_backup_file.write(sms_text) + + sms_backup_file.close() + +def write_mms_messages(participants_raw, messages_raw): + sms_backup_file = open(sms_backup_filename, 'a') + + participants = get_participant_phone_numbers(participants_raw) + mms_values = {'participants' : '~'.join(participants)} + + for i in range(len(messages_raw)): + sender = get_mms_sender(messages_raw[i]) + sent_by_me = sender not in participants + + mms_values['type'] = get_message_type(messages_raw[i]) + mms_values['message'] = get_message_text(messages_raw[i]) + mms_values['time'] = get_time_unix(messages_raw[i]) + mms_values['participants_xml'] = u'' + mms_values['msg_box'] = 2 if sent_by_me else 1 + mms_values['m_type'] = 128 if sent_by_me else 132 + + for participant in participants: + participant_is_sender = participant == sender or (sent_by_me and participant == 'Me') + participant_values = {'number': participant, 'code': 137 if participant_is_sender else 151} + mms_values['participants_xml'] += (' \n' % participant_values) + + mms_text = (' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '%(participants_xml)s' + ' \n' + ' \n' % mms_values) + + 
sms_backup_file.write(mms_text) + + sms_backup_file.close() + def get_message_type(message): # author_raw = messages_raw[i].cite author_raw = message.cite if ( not author_raw.span ): @@ -85,7 +123,11 @@ def get_message_type(message): # author_raw = messages_raw[i].cite def get_message_text(message): return BeautifulSoup(message.find('q').text,'html.parser').prettify(formatter='html').strip().replace('"',"'") -def get_phone(messages): +def get_mms_sender(message): + return format_number(phonenumbers.parse(message.cite.a['href'][4:], None)) + +def get_first_phone_number(messages, fallback_number): + # handle group messages for author_raw in messages: if (not author_raw.span): continue @@ -97,18 +139,38 @@ def get_phone(messages): except phonenumbers.phonenumberutil.NumberParseException: return sender_data.a['href'][4:] - if(phone_number.country_code == 1): - return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)[1:].replace(' ', '-') - else: - return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.E164) - return 0 + return format_number(phone_number) + + # fallback case, use number from filename + if (fallback_number == 0 or len(fallback_number) < 7): + return fallback_number + else: + return format_number(phonenumbers.parse(fallback_number, None)) + +def get_participant_phone_numbers(participants_raw): + participants = ['Me'] # May require adding a contact for "Me" to your phone, with your current number + + for participant_set in participants_raw: + for participant in participant_set: + if (not hasattr(participant, 'a')): + continue + + try: + phone_number = phonenumbers.parse(participant.a['href'][4:], None) + except phonenumbers.phonenumberutil.NumberParseException: + participants.append(participant.a['href'][4:]) # unparseable: keep the raw href value; phone_number is not set for this participant + continue + participants.append(format_number(phone_number)) + + return participants + +def format_number(phone_number): + return phonenumbers.format_number(phone_number, 
phonenumbers.PhoneNumberFormat.E164) def get_time_unix(message): time_raw = message.find(class_='dt') ymdhms = time_raw['title'] - time_obj = datetime.datetime.strptime(ymdhms.replace('Z','UTC'), '%Y-%m-%dT%H:%M:%S.%f%Z') - #print('GV Date: ', ymdhms) - #mstime = time.mktime(time_obj.timetuple()) * 1000 + time_obj.microsecond / 1000 - mstime = timegm(time_obj.timetuple()) * 1000 + time_obj.microsecond / 1000 + time_obj = dateutil.parser.isoparse(ymdhms) # may carry a non-UTC offset, so convert with utctimetuple() below + mstime = timegm(time_obj.utctimetuple()) * 1000 + time_obj.microsecond / 1000 return int(mstime) @@ -118,10 +180,10 @@ def write_header(filename, numsms): backup_file.close() backup_file = open(filename, 'w') - backup_file.write("\n") - backup_file.write("\n") - backup_file.write('\n') + backup_file.write(u"\n") + backup_file.write(u"\n") + backup_file.write(u'\n') backup_file.write(backup_text) backup_file.close() - + main()