Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for updated takeout format #2

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sms/
gvoice-all.xml
24 changes: 21 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,24 @@
# gvoice-sms-takeout-xml
Convert Google Voice SMS data from Takeout to .xml suitable for use with SMS Backup and Restore
Convert Google Voice SMS data from Takeout to .xml suitable for use with SMS Backup and Restore.
Input data is a folder of SMS .html files from Google Takeout.

Working as of 2020-04-22.

## How to use:
1. Go to https://contacts.google.com
2. Export all Google Contacts
3. Delete all Google Contacts. This is required so that a phone number shows up for each thread; otherwise Takeout will sometimes contain only names. You can skip this step, but some messages won't be linked to the right thread if you do. Note that deleting contacts may remove Contact Photos on iOS unless you pause syncing on your iOS device first.
4. Get Google Voice Takeout and Download
5. Restore contacts to your account
6. Download this script to your computer
7. Extract Google Voice Takeout and move the folder into the same folder as this script
8. Open terminal
9. Install pip (sudo easy_install pip)
10. sudo pip install virtualenv
11. virtualenv sms (then activate it with: source sms/bin/activate)
12. pip install phonenumbers BeautifulSoup4 python-dateutil
13. python sms.py
14. Copy the file "gvoice-all.xml" to your phone, then restore from it using SMS Backup and Restore


This is a personal project from a few years back when Google switched Voice to Hangouts and I wanted to grab my old messages and get them into a usable format. It worked at the time; I don't know if it works as-is, but I'm planning on some testing in the near term to get it functional.

Input data is a folder of SMS .html files from Google Takeout.
164 changes: 113 additions & 51 deletions sms.py
Original file line number Diff line number Diff line change
@@ -1,78 +1,116 @@
from bs4 import BeautifulSoup
import re
import os
import phonenumbers
import dateutil.parser
import time, datetime
from calendar import timegm
import warnings

import tkinter as tk
from tkinter import filedialog

# Create the Tk root window and immediately hide it, so that only the
# file-selection dialog below is shown.
root = tk.Tk()
root.withdraw()

# Ask the user to pick a file.
# NOTE(review): file_path is not referenced by any code visible in this
# file -- confirm whether this dialog is still needed.
file_path = filedialog.askopenfilename()

"""
TO DO:
filter emoji
fix double-double quotes
"""
from io import open # adds emoji support

# Path of the XML backup file; the writer functions and main() open it in
# append mode.
sms_backup_filename = "./gvoice-all.xml"
print('New file will be saved to ' + sms_backup_filename)

def main():
    """Convert every ``.html`` file under the current directory to XML.

    Walks ``.`` recursively.  Each ``.html`` file is parsed with
    BeautifulSoup and its ``class="message"`` elements are handed to
    ``write_mms_messages`` (file name starts with ``Group Conversation``)
    or ``write_sms_messages`` (everything else).  Afterwards the closing
    ``</smses>`` tag is appended and ``write_header`` is called with the
    total number of messages seen.
    """
    print('Checking directory for *.html files')
    num_sms = 0
    root_dir = '.'

    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            sms_filename = os.path.join(subdir, file)

            # Check the extension before opening, so that unrelated files
            # are never opened at all.
            if os.path.splitext(sms_filename)[1] != '.html':
                continue

            try:
                sms_file = open(sms_filename, 'r')
            except FileNotFoundError:
                # The entry vanished (or is a dangling link); skip it.
                continue

            print('Processing ' + sms_filename)

            # Close the input as soon as it has been parsed instead of
            # leaking one handle per conversation file.
            with sms_file:
                soup = BeautifulSoup(sms_file, 'html.parser')

            messages_raw = soup.find_all(class_='message')
            num_sms += len(messages_raw)

            if file.startswith('Group Conversation'):
                participants_raw = soup.find_all(class_='participants')
                write_mms_messages(participants_raw, messages_raw)
            else:
                write_sms_messages(file, messages_raw)

    with open(sms_backup_filename, 'a') as sms_backup_file:
        sms_backup_file.write(u'</smses>')

    write_header(sms_backup_filename, num_sms)

def write_sms_messages(file, messages_raw):
    """Append one ``<sms>`` element per message to the backup file.

    Args:
        file: File name of the conversation (no directory).  A leading run
            of digits, optionally preceded by ``+``, is used as the
            fallback address.
        messages_raw: The parsed ``class="message"`` elements of the
            conversation.

    Every element of one conversation is written with the same
    ``address``, obtained from ``get_first_phone_number``.
    """
    # 0 is the "no number in the file name" marker that
    # get_first_phone_number() checks for.
    title_has_number = re.search(r"(^\+*[0-9]+)", file)
    fallback_number = title_has_number.group() if title_has_number else 0

    sms_values = {'phone': get_first_phone_number(messages_raw, fallback_number)}

    # The context manager closes the output even when one of the extractor
    # helpers raises on a malformed message.
    with open(sms_backup_filename, 'a') as sms_backup_file:
        for message in messages_raw:
            sms_values['type'] = get_message_type(message)
            sms_values['message'] = get_message_text(message)
            sms_values['time'] = get_time_unix(message)
            sms_text = ('<sms protocol="0" address="%(phone)s" '
                        'date="%(time)s" type="%(type)s" '
                        'subject="null" body="%(message)s" '
                        'toa="null" sc_toa="null" service_center="null" '
                        'read="1" status="1" locked="0" /> \n' % sms_values)
            sms_backup_file.write(sms_text)

def write_mms_messages(participants_raw, messages_raw):
    """Append one ``<mms>`` element per group-conversation message.

    Args:
        participants_raw: The parsed ``class="participants"`` elements of
            the conversation.
        messages_raw: The parsed ``class="message"`` elements of the
            conversation.

    A message whose sender is not in the participant list is treated as
    sent by the account owner (``msg_box`` 2 / ``m_type`` 128); any other
    message is treated as received (``msg_box`` 1 / ``m_type`` 132).
    """
    participants = get_participant_phone_numbers(participants_raw)
    mms_values = {'participants': '~'.join(participants)}

    # The context manager closes the output even when one of the extractor
    # helpers raises on a malformed message.
    with open(sms_backup_filename, 'a') as sms_backup_file:
        for message in messages_raw:
            sender = get_mms_sender(message)
            sent_by_me = sender not in participants

            # 'type' is not referenced by the <mms> template below; the
            # call is kept so the set of helpers invoked is unchanged.
            mms_values['type'] = get_message_type(message)
            mms_values['message'] = get_message_text(message)
            mms_values['time'] = get_time_unix(message)
            mms_values['msg_box'] = 2 if sent_by_me else 1
            mms_values['m_type'] = 128 if sent_by_me else 132

            # One <addr> per participant: code 137 marks the sender (the
            # 'Me' entry when the owner sent it), 151 everyone else.
            addr_lines = []
            for participant in participants:
                participant_is_sender = (
                    participant == sender
                    or (sent_by_me and participant == 'Me')
                )
                participant_values = {
                    'number': participant,
                    'code': 137 if participant_is_sender else 151,
                }
                addr_lines.append(
                    ' <addr address="%(number)s" charset="106" type="%(code)s"/> \n'
                    % participant_values
                )
            mms_values['participants_xml'] = u''.join(addr_lines)

            mms_text = ('<mms address="%(participants)s" ct_t="application/vnd.wap.multipart.related" '
                        'date="%(time)s" m_type="%(m_type)s" msg_box="%(msg_box)s" read="1" '
                        'rr="129" seen="1" sub_id="-1" text_only="1"> \n'
                        ' <parts> \n'
                        ' <part ct="text/plain" seq="0" text="%(message)s"/> \n'
                        ' </parts> \n'
                        ' <addrs> \n'
                        '%(participants_xml)s'
                        ' </addrs> \n'
                        '</mms> \n' % mms_values)

            sms_backup_file.write(mms_text)

def get_message_type(message): # author_raw = messages_raw[i].cite
author_raw = message.cite
if ( not author_raw.span ):
Expand All @@ -85,7 +123,11 @@ def get_message_type(message): # author_raw = messages_raw[i].cite
def get_message_text(message):
    """Return the text of the message's ``<q>`` element for XML output.

    The text is re-parsed and pretty-printed with the ``html`` formatter,
    surrounding whitespace is stripped, and every double quote is replaced
    by a single quote.
    """
    quoted_text = message.find('q').text
    reparsed = BeautifulSoup(quoted_text, 'html.parser')
    rendered = reparsed.prettify(formatter='html')
    return rendered.strip().replace('"', "'")

def get_phone(messages):
def get_mms_sender(message):
    """Return the sender of a group-conversation message.

    The number is read from the ``href`` of the link inside the message's
    ``<cite>`` element, with its first four characters dropped
    (presumably the ``tel:`` scheme prefix -- confirm against real
    Takeout data).

    Returns:
        The number formatted by ``format_number``, or the raw string when
        it cannot be parsed.  This mirrors the fallback used for the
        participant list, so the two values stay comparable.
    """
    raw_number = message.cite.a['href'][4:]
    try:
        return format_number(phonenumbers.parse(raw_number, None))
    except phonenumbers.phonenumberutil.NumberParseException:
        return raw_number

def get_first_phone_number(messages, fallback_number):
# handle group messages
for author_raw in messages:
if (not author_raw.span):
continue
Expand All @@ -97,18 +139,38 @@ def get_phone(messages):
except phonenumbers.phonenumberutil.NumberParseException:
return sender_data.a['href'][4:]

if(phone_number.country_code == 1):
return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)[1:].replace(' ', '-')
else:
return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.E164)
return 0
return format_number(phone_number)

# fallback case, use number from filename
if (fallback_number == 0 or len(fallback_number) < 7):
return fallback_number
else:
return format_number(phonenumbers.parse(fallback_number, None))

def get_participant_phone_numbers(participants_raw):
    """Collect the addresses of everyone in a group conversation.

    Args:
        participants_raw: Iterable of participant containers; each
            container is iterated and every child that exposes a link
            through ``.a`` contributes one address.

    Returns:
        A list that always starts with ``'Me'``, followed by one entry per
        linked participant: the number formatted by ``format_number``, or
        the raw string when it cannot be parsed.
    """
    # May require adding a contact for "Me" to your phone, with your
    # current number.
    participants = ['Me']

    for participant_set in participants_raw:
        for participant in participant_set:
            # Children without an 'a' attribute, and elements whose 'a' is
            # None, carry no number to extract.
            link = getattr(participant, 'a', None)
            if link is None:
                continue

            # Drop the first four characters of the href (presumably the
            # 'tel:' scheme prefix).
            raw_number = link['href'][4:]
            try:
                phone_number = phonenumbers.parse(raw_number, None)
            except phonenumbers.phonenumberutil.NumberParseException:
                # Keep the unparsable value as-is and move on, rather than
                # falling through to format a number that was never parsed.
                participants.append(raw_number)
                continue

            participants.append(format_number(phone_number))

    return participants

def format_number(phone_number):
    """Format a parsed phone number using the library's E164 format."""
    e164 = phonenumbers.PhoneNumberFormat.E164
    return phonenumbers.format_number(phone_number, e164)

def get_time_unix(message):
    """Return the message time as integer milliseconds since the epoch.

    The ISO-8601 timestamp is read from the ``title`` attribute of the
    message's ``class="dt"`` element.
    """
    time_raw = message.find(class_='dt')
    ymdhms = time_raw['title']
    time_obj = dateutil.parser.isoparse(ymdhms)
    # utctimetuple() applies the parsed UTC offset before timegm()
    # interprets the fields as UTC; timetuple() would silently drop the
    # offset and shift every non-UTC timestamp.  A timestamp without an
    # offset is still treated as UTC.
    seconds = timegm(time_obj.utctimetuple())
    return seconds * 1000 + time_obj.microsecond // 1000

Expand All @@ -118,10 +180,10 @@ def write_header(filename, numsms):
backup_file.close()

backup_file = open(filename, 'w')
backup_file.write("<?xml version='1.0' encoding='UTF-8' standalone='yes' ?>\n")
backup_file.write("<!--Converted from GV Takeout data -->\n")
backup_file.write('<smses count="' + str(numsms) + '">\n')
backup_file.write(u"<?xml version='1.0' encoding='UTF-8' standalone='yes' ?>\n")
backup_file.write(u"<!--Converted from GV Takeout data -->\n")
backup_file.write(u'<smses count="' + str(numsms) + u'">\n')
backup_file.write(backup_text)
backup_file.close()

# Unconditional entry point: the conversion runs whenever this file is
# executed.
main()