import re
from collections import OrderedDict
import string as s
import subprocess as sp
from utils.process_pdf import txt_from_pdf
from utils.process_docx import txt_from_docx

# Section headings the parser currently recognises.
headings = [
	'experience',
	'education',
	'skills',
	'interests',
	'roles applying for',
	'current notice period',
]

# Whitelists of allowed characters, keyed by the kind of value being sanitized.
sanit = {}
sanit['names'] = s.ascii_letters+s.digits+'_.'

def sanitize(unsanitized, typ):
    """Strip every character from *unsanitized* that is not whitelisted for *typ*.

    Args:
        unsanitized: The string to clean, or None.
        typ: Key into the module-level ``sanit`` whitelist table (e.g. 'names').

    Returns:
        None when *unsanitized* is None; the filtered string on success; the
        literal string "Invalid" when *typ* is not a known whitelist or
        *unsanitized* is not an iterable of strings.
    """
    if unsanitized is None:
        return None
    try:
        # Look the whitelist up once so an unknown typ is reported even for
        # empty input, and the lookup is not repeated per character.
        allowed = sanit[typ]
        return ''.join(c for c in unsanitized if c in allowed)
    except (KeyError, TypeError):
        # Only the expected failure modes are mapped to "Invalid"; a bare
        # except would also swallow KeyboardInterrupt/SystemExit.
        return "Invalid"

def get_data_in_each_section(text_list, section_headings=None):
	"""Split resume lines into sections keyed by the heading that starts them.

	A line counts as a heading when, after stripping and lower-casing, it
	equals one of the supported headings. Everything before the first heading
	is returned under the key 'name'; each heading owns the lines up to the
	next heading (or the end of the text).

	Args:
		text_list: Iterable of text lines; blank lines are dropped.
		section_headings: Optional iterable of heading names to recognise.
			Defaults to the module-level ``headings`` list.

	Returns:
		dict mapping 'name' and each lower-cased heading found to its list of
		stripped lines. An empty dict when no heading is present.
	"""
	if section_headings is None:
		section_headings = headings
	known = {h.strip().lower() for h in section_headings}

	lines = [line.strip() for line in text_list if line.strip()]

	# Record the line index of each heading under its normalised (lower-case)
	# name so later lookups by lower-case heading always succeed. If a heading
	# occurs more than once, the last occurrence wins.
	heading_index = {}
	for i, line in enumerate(lines):
		key = line.lower()
		if key in known:
			heading_index[key] = i

	if not heading_index:
		return {}

	# Slice in document order, not in the order the headings are declared,
	# so resumes that arrange their sections differently are still split
	# correctly.
	ordered = sorted(heading_index.items(), key=lambda item: item[1])
	section_ends = [index for _, index in ordered[1:]] + [len(lines)]

	data_in_each_section = {'name': lines[:ordered[0][1]]}
	for (key, start), end in zip(ordered, section_ends):
		data_in_each_section[key] = lines[start+1:end]
	return data_in_each_section

def get_name_fields(name_data):
	"""Classify the lines that precede the first heading into contact fields.

	The first line is taken as the candidate's name. Each remaining line is
	classified as an email (contains something@something.something), a phone
	number (only digits and common separators, optional leading '+'), or,
	failing both, part of the address.

	Args:
		name_data: List of lines from the top of the resume. Not modified.

	Returns:
		{'name': [str], 'address': [...], 'email': [...], 'phone': [...]}, or
		{'name': 'Not Found/Incorrect Format'} when *name_data* is empty.
	"""
	# Guard before touching the first element: an empty block has no name.
	if not name_data:
		return {'name': 'Not Found/Incorrect Format'}

	email_regex = r'.+@.+\..+'
	# The whole line must look like a phone number. A pattern that can match
	# the empty string would classify every non-email line as a phone.
	phone_regex = r'\+?\(?\d[\d\s().-]{5,}\d'

	name = [name_data[0].strip()]
	address = []
	phone = []
	email = []

	for raw in name_data[1:]:
		line = raw.strip()
		if re.search(email_regex, line):
			email.append(line)
		elif re.fullmatch(phone_regex, line):
			phone.append(line)
		else:
			address.append(line)

	return {'name':name, 'address':address, 'email':email, 'phone':phone}


def get_years_from_span(span_string):
	"""Split a 'START - END' span on '-' and return both ends, stripped.

	Only the first two hyphen-separated pieces are used. A span without a
	hyphen raises IndexError.
	"""
	pieces = span_string.split('-')
	start = pieces[0].strip()
	end = pieces[1].strip()
	return dict(start_year=start, end_year=end)

def get_data_without_junk(data):
	"""Return the stripped entries of *data* that contain at least one ASCII letter."""
	cleaned = []
	for entry in data:
		stripped = entry.strip()
		if re.search(r'[A-Za-z]', stripped) is None:
			# No letters at all (blank, digits only, punctuation): treat as junk.
			continue
		cleaned.append(stripped)
	return cleaned

def get_descriptive_fields(data, heading):
	"""Parse an 'experience' or 'education' section into numbered entries.

	An entry starts at a line beginning with a four-digit year span
	('YYYY - YYYY'). The text after the last ':' on that line is the place
	(stored as 'company' for experience, 'institute' otherwise). The following
	non-date lines are joined with spaces into the entry's description.
	Lines that appear before the first dated line belong to no entry and are
	ignored.

	Args:
		data: List of lines belonging to the section.
		heading: Section name, e.g. 'experience' or 'education'.

	Returns:
		{1: {...}, 2: {...}, ...} where each entry has 'start_year',
		'end_year', 'company' or 'institute', and 'description';
		{heading: 'Not Found/Incorrect Format'} when *data* is empty;
		{heading: 'Incorrect Format'} when no line starts with a year span.
	"""
	# Check for empty input first so it is reported instead of failing on
	# variables that were never assigned.
	if not data:
		return {heading:'Not Found/Incorrect Format'}

	span_pattern = re.compile(r'^\s*(\d{4})\s*-\s*(\d{4})')
	location_key = 'company' if heading == 'experience' else 'institute'

	entries = []
	for raw in data:
		line = raw.strip()
		match = span_pattern.match(line)
		if match:
			entries.append({
				'start_year': match.group(1),
				'end_year': match.group(2),
				location_key: line.split(':')[-1].strip(),
				'description': [],
			})
		elif entries and line:
			# Free text belongs to the most recent dated entry.
			entries[-1]['description'].append(line)

	if not entries:
		return {heading:'Incorrect Format'}

	dict_ = {}
	for count, entry in enumerate(entries, start=1):
		entry['description'] = ' '.join(entry['description'])
		dict_[count] = entry
	return dict_

def get_list_fields(data, heading):
	"""Turn a list-style section (skills, interests, roles) into a tuple of items.

	For the interests section the lines are joined and split on commas, so a
	comma-separated list becomes one item per interest. For every other
	heading each line is one item. Items without any letters are dropped.

	Args:
		data: List of lines belonging to the section.
		heading: Section name. 'roles applying for' is reported under the
			key 'roles_applied'; other headings are used as the key unchanged.

	Returns:
		{key: tuple_of_items}, or {heading: 'Not Found/Incorrect Format'}
		when no usable item remains.
	"""
	# Compare case-insensitively: callers pass the lower-cased heading, so a
	# check against 'Interests' would never be true.
	if heading.strip().lower() == 'interests':
		# Join with a space so text wrapped across lines is not glued together.
		data = ' '.join(data).split(',')
	items = get_data_without_junk(data)

	if not items:
		return {heading:'Not Found/Incorrect Format'}

	# Callers currently expect the items as a tuple.
	key = 'roles_applied' if heading == 'roles applying for' else heading
	return {key: tuple(items)}

def get_single_line_fields(data, heading):
	"""Return the single line expected under *heading*.

	Exactly one line yields {heading: that_line}. Zero lines or more than one
	line yield {heading: <error message>} instead.
	"""
	line_count = len(data)
	if line_count == 1:
		return {heading: data[0]}
	if line_count == 0:
		return {heading: 'Not found/Incorrect Format'}
	return {heading: f'Incorrect Format only expected one line for {heading}'}

def extract(text_list):
	"""Extract structured resume fields from a list of text lines.

	The lines are split into sections by heading, then each section is handed
	to the parser that matches its kind: contact block, dated entries
	(experience/education), item lists (skills/interests/roles), or a
	single-line value (current notice period).

	Args:
		text_list: List of text lines from the resume.

	Returns:
		dict of extracted fields, or
		{0: 'None of the expected headings were found/ Incorrect Format'}
		when the section splitter returns nothing.
	"""
	data_in_each_section = get_data_in_each_section(text_list)

	if not data_in_each_section:
		return {0:'None of the expected headings were found/ Incorrect Format'}

	fields = dict()
	# Iterate over items so the section data is taken from the same entry the
	# heading came from, rather than re-indexed with a normalised key that may
	# not exist in the dict.
	for raw_heading, section_data in data_in_each_section.items():
		heading = raw_heading.strip().lower()
		if heading == 'name':
			fields.update(get_name_fields(section_data))

		elif heading in ('experience', 'education'):
			fields[heading] = get_descriptive_fields(section_data, heading)

		elif heading in ('skills', 'interests', 'roles applying for'):
			# get_list_fields already returns a dict; wrapping it in braces
			# would build a set containing a dict and raise TypeError.
			fields.update(get_list_fields(section_data, heading))

		elif heading == 'current notice period':
			fields.update(get_single_line_fields(section_data, heading))

	return fields


# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = ['pdf', 'docx', 'txt']

def allowed_format(fname):
    """Return True when *fname* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last '.' and is compared
    case-insensitively, so 'cv.PDF' is accepted just like 'cv.pdf'.
    A name without any '.' is rejected.
    """
    if '.' not in fname:
        return False
    extension = fname.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS

# NOTE: `sanit` and `sanitize` are also defined near the top of this file.
# Because this definition comes later in the module, it is the one that stays
# bound after import.
# Whitelists of allowed characters, keyed by the kind of value being sanitized.
sanit = {}
sanit['names'] = s.ascii_letters+s.digits+'_.'

def sanitize(unsanitized, typ):
    """Strip every character from *unsanitized* that is not whitelisted for *typ*.

    Args:
        unsanitized: The string to clean, or None.
        typ: Key into the module-level ``sanit`` whitelist table (e.g. 'names').

    Returns:
        None when *unsanitized* is None; the filtered string on success; the
        literal string "Invalid" when *typ* is not a known whitelist or
        *unsanitized* is not an iterable of strings.
    """
    if unsanitized is None:
        return None
    try:
        # Look the whitelist up once so an unknown typ is reported even for
        # empty input, and the lookup is not repeated per character.
        allowed = sanit[typ]
        return ''.join(c for c in unsanitized if c in allowed)
    except (KeyError, TypeError):
        # Only the expected failure modes are mapped to "Invalid"; a bare
        # except would also swallow KeyboardInterrupt/SystemExit.
        return "Invalid"

def process_resume(filename,file):
	"""Detect the resume's file type and extract its structured fields.

	The MIME type is obtained by running the external `file` command on
	*file*; the text is then read with the matching reader (PDF, DOCX or
	plain text) and passed to `extract`.

	Args:
		filename: Name of the uploaded file. Not used by the processing
			itself; kept so existing callers keep working.
		file: The resume to process (see the review note below).

	Returns:
		The dict produced by `extract`, or 0 when the detected MIME type is
		not PDF, a word-processing document, or plain text.
	"""
	# NOTE(review): `file` is handed to the `file` command as a command-line
	# argument (i.e. treated like a path) but the plain-text branch calls
	# `file.readlines()` and decodes bytes (i.e. treated like a binary file
	# object). Confirm with the callers which of the two is actually passed.
	temp = sp.run(['file', '-b', '--mime-type', file], capture_output=True)
	mime_type = temp.stdout.decode()

	if 'application/pdf' in mime_type:
		text_list = txt_from_pdf(file)
	elif 'wordprocessing' in mime_type:
		text_list = txt_from_docx(file)
	elif 'text/plain' in mime_type:
		text_list = [str(i.strip(), 'UTF-8') for i in file.readlines()]
	else:
		return 0

	# The extracted lines are deliberately not printed: they contain the
	# candidate's personal data.
	return extract(text_list)