История изменений
Исправление arcanis (текущая версия):
Первый раз слышу про проблемы с парсингом имейла.
Видимо, ты их не парсил руками. Вот тебе кусок старого кода на питоне, который парсит _только_ часть заголовков:
# Excerpt from a loop body (see the `continue` statements); the loop header and
# the original indentation are missing from this paste.
# NOTE(review): `email.Header` and `.decode()` on header values look like Python 2 — confirm.
# Read the file `mail` under `path` and parse it into a message object.
try:
with open(os.path.join(path, mail), 'r') as mail_file:
attached_mail = email.message_from_file(mail_file)
except (IOError, TypeError):
# On IOError/TypeError: log the failure with its traceback and skip this item.
logging.error('Could not get data from {}'.format(mail), exc_info=True)
continue
# Recipient headers; get_all() returns the [] default when a header is absent.
rec_list = attached_mail.get_all('To', [])
bcc_list = attached_mail.get_all('Bcc', [])
cc_list = attached_mail.get_all('Cc', [])
# getaddresses() yields (name, address) pairs; keep only the address part.
recipients = ', '.join([addr[1] for addr in email.utils.getaddresses(rec_list + bcc_list + cc_list)])
if not recipients:
# No recipients at all: skip this item.
# Seems to be a bug in the Google API: a mail named 'Attachment'.
continue
# Sender addresses, extracted the same way from the From header(s).
sender = ', '.join([addr[1] for addr in email.utils.getaddresses(attached_mail.get_all('From', []))])
# Decode the subject: only the first (text, charset) part returned by
# decode_header() is used.
subject, subj_encoding = email.Header.decode_header(attached_mail.get('Subject', ''))[0]
if subj_encoding is not None:
try:
subject = subject.decode(subj_encoding)
except:
# Some mail agents declare an invalid encoding; keep the undecoded subject.
pass
# Date header, decoded the same way (first part only).
date, date_encoding = email.Header.decode_header(attached_mail.get('Date', ''))[0]
# NOTE(review): this `except:` has no matching `try:` in the paste — lines were
# presumably lost when the excerpt was trimmed; confirm against the original code.
except:
if date_encoding is not None:
date = date.decode(date_encoding)
try:
# presumably `date_parser` is dateutil.parser — confirm.
date = date_parser.parse(date)
except ValueError:
# Some clients send an invalid TZ/datetime format; fall back to the current date.
date = datetime.datetime.now()
Но на бумажке выглядит просто, да.
Исходная версия arcanis:
Первый раз слышу про проблемы с парсингом имейла.
Видимо, ты их не парсил руками. Вот тебе кусок старого кода на питоне, который парсит, по сути, _только_ заголовки:
# Excerpt from a loop body (see the `continue` statements); the loop header and
# the original indentation are missing from this paste.
# NOTE(review): `email.Header` and `.decode()` on header values look like Python 2 — confirm.
# Read the file `mail` under `path` and parse it into a message object.
try:
with open(os.path.join(path, mail), 'r') as mail_file:
attached_mail = email.message_from_file(mail_file)
except (IOError, TypeError):
# On IOError/TypeError: log the failure with its traceback and skip this item.
logging.error('Could not get data from {}'.format(mail), exc_info=True)
continue
# Recipient headers; get_all() returns the [] default when a header is absent.
rec_list = attached_mail.get_all('To', [])
bcc_list = attached_mail.get_all('Bcc', [])
cc_list = attached_mail.get_all('Cc', [])
# getaddresses() yields (name, address) pairs; keep only the address part.
recipients = ', '.join([addr[1] for addr in email.utils.getaddresses(rec_list + bcc_list + cc_list)])
if not recipients:
# No recipients at all: skip this item.
# Seems to be a bug in the Google API: a mail named 'Attachment'.
continue
# Sender addresses, extracted the same way from the From header(s).
sender = ', '.join([addr[1] for addr in email.utils.getaddresses(attached_mail.get_all('From', []))])
# Decode the subject: only the first (text, charset) part returned by
# decode_header() is used.
subject, subj_encoding = email.Header.decode_header(attached_mail.get('Subject', ''))[0]
if subj_encoding is not None:
try:
subject = subject.decode(subj_encoding)
except:
# Some mail agents declare an invalid encoding; keep the undecoded subject.
pass
# Date header, decoded the same way (first part only).
date, date_encoding = email.Header.decode_header(attached_mail.get('Date', ''))[0]
# NOTE(review): this `except:` has no matching `try:` in the paste — lines were
# presumably lost when the excerpt was trimmed; confirm against the original code.
except:
if date_encoding is not None:
date = date.decode(date_encoding)
try:
# presumably `date_parser` is dateutil.parser — confirm.
date = date_parser.parse(date)
except ValueError:
# Some clients send an invalid TZ/datetime format; fall back to the current date.
date = datetime.datetime.now()
Но на бумажке выглядит просто, да.