"""Extract structured fields from a resume given as PDF, DOCX or plain text.

The resume text is split into sections by a fixed set of heading lines (see
``headings``); everything before the first heading is treated as the
name/contact block. Each section is then parsed by a heading-specific helper
and the results are merged into a single dictionary by ``extract``.
"""
import logging
import re
import string as s
import subprocess as sp
from collections import OrderedDict  # noqa: F401  (existing import, kept)

from utils.process_pdf import txt_from_pdf
from utils.process_docx import txt_from_docx

logger = logging.getLogger(__name__)

# These headings are currently supported.
headings = ['experience', 'education', 'skills', 'interests',
            'roles applying for', 'current notice period']

# Character whitelist per sanitisation type.
sanit = {}
sanit['names'] = s.ascii_letters + s.digits + '_.'

ALLOWED_EXTENSIONS = ['pdf', 'docx', 'txt']

# Compiled once; used by the section parsers below.
_EMAIL_RE = re.compile(r'.+@.+\..+')
# A run that starts and ends with a digit and contains only digits and the
# usual phone separators in between (at least 7 characters overall).
_PHONE_RE = re.compile(r'\+?\d[\d\s().-]{5,}\d')
_MIN_PHONE_DIGITS = 7
_YEAR_SPAN_RE = re.compile(r'^\s*\d{4}\s*-\s*\d{4}')


def sanitize(unsanitized, typ):
    """Return ``unsanitized`` with every character not whitelisted removed.

    Args:
        unsanitized: The string to filter, or ``None``.
        typ: Key into ``sanit`` selecting the whitelist (e.g. ``'names'``).

    Returns:
        ``None`` if ``unsanitized`` is ``None``; the filtered string
        otherwise; the literal ``"Invalid"`` when ``typ`` is unknown or the
        input cannot be filtered character by character.
    """
    if unsanitized is None:
        return None
    try:
        allowed = sanit[typ]
        return ''.join(c for c in unsanitized if c in allowed)
    except (KeyError, TypeError):
        return "Invalid"


def get_data_in_each_section(text_list):
    """Split resume lines into sections keyed by lowercase heading.

    Blank lines are dropped and remaining lines are stripped. A line is a
    heading when, lowercased, it equals one of ``headings``. If a heading
    occurs more than once, the last occurrence is used.

    Args:
        text_list: Iterable of text lines.

    Returns:
        A dict with ``'name'`` mapped to the lines before the first heading,
        followed by one entry per heading found (in document order) mapped to
        the lines up to the next heading. Returns an empty dict when none of
        the supported headings is present.
    """
    lines = [line.strip() for line in text_list if line.strip()]
    known = set(headings)

    positions = {}
    for index, line in enumerate(lines):
        key = line.lower()
        if key in known:
            # Store under the lowercase key so later lookups by the
            # normalised heading succeed regardless of the resume's casing.
            positions[key] = index

    if not positions:
        return {}

    # Slice by position in the document, not by the order of ``headings``.
    ordered = sorted(positions, key=positions.get)
    sections = {'name': lines[:positions[ordered[0]]]}
    ends = [positions[h] for h in ordered[1:]] + [len(lines)]
    for heading, end in zip(ordered, ends):
        sections[heading] = lines[positions[heading] + 1:end]
    return sections


def _looks_like_phone(text):
    """Heuristically decide whether ``text`` contains a phone number."""
    return any(
        sum(ch.isdigit() for ch in match.group()) >= _MIN_PHONE_DIGITS
        for match in _PHONE_RE.finditer(text)
    )


def get_name_fields(name_data):
    """Parse the leading name/contact block of a resume.

    The first line is taken as the name. Each following line is classified
    as an email (contains ``x@y.z``), a phone number (heuristic, see
    ``_looks_like_phone``) or, failing both, part of the address.

    Args:
        name_data: Lines preceding the first heading. Not modified.

    Returns:
        ``{'name': 'Not Found/Incorrect Format'}`` when ``name_data`` is
        empty; otherwise a dict with list values under ``'name'``,
        ``'address'``, ``'email'`` and ``'phone'``.
    """
    if not name_data:
        return {'name': 'Not Found/Incorrect Format'}

    name = [name_data[0].strip()]
    address = []
    phone = []
    email = []
    for raw in name_data[1:]:
        entry = raw.strip()
        if _EMAIL_RE.search(entry):
            email.append(entry)
        elif _looks_like_phone(entry):
            phone.append(entry)
        else:
            address.append(entry)
    return {'name': name, 'address': address, 'email': email, 'phone': phone}


def get_years_from_span(span_string):
    """Split a ``'YYYY - YYYY'`` span into start and end year strings.

    Returns:
        ``{'start_year': ..., 'end_year': ...}``; ``end_year`` is an empty
        string when the span contains no ``'-'``.
    """
    parts = [part.strip() for part in span_string.split('-')]
    end_year = parts[1] if len(parts) > 1 else ''
    return {'start_year': parts[0], 'end_year': end_year}


def get_data_without_junk(data):
    """Return the stripped items of ``data`` that contain an ASCII letter."""
    return [item.strip() for item in data if re.search(r'[A-Za-z]', item)]


def get_descriptive_fields(data, heading):
    """Parse a dated section (experience / education) into numbered entries.

    An entry starts at a line beginning with ``YYYY - YYYY``; the text after
    the last ``':'`` on that line is the place. Following lines, up to the
    next dated line, are joined with spaces into the description. Lines that
    appear before the first dated line belong to no entry and are ignored.

    Args:
        data: Lines of the section.
        heading: ``'experience'`` stores the place under ``'company'``; any
            other heading stores it under ``'institute'``.

    Returns:
        ``{1: entry, 2: entry, ...}`` where each entry has ``start_year``,
        ``end_year``, the place key and ``description``. When ``data`` is
        empty returns ``{heading: 'Not Found/Incorrect Format'}``; when no
        dated line is found returns ``{heading: 'Incorrect Format'}``.
    """
    if not data:
        return {heading: 'Not Found/Incorrect Format'}

    place_key = 'company' if heading == 'experience' else 'institute'
    entries = {}
    current = None
    description = []

    def flush():
        current['description'] = ' '.join(description)
        entries[len(entries) + 1] = current

    for line in data:
        text = line.strip()
        span = _YEAR_SPAN_RE.match(text)
        if span:
            if current is not None:
                flush()
            current = get_years_from_span(span.group().strip())
            current[place_key] = text.split(':')[-1].strip()
            description = []
        elif current is not None and text:
            description.append(text)

    if current is None:
        return {heading: 'Incorrect Format'}
    flush()
    return entries


def get_list_fields(data, heading):
    """Parse a list-style section (skills / interests / roles) into a tuple.

    For the interests heading (compared case-insensitively) the lines are
    treated as one comma-separated list; for other headings each line is one
    item. Items without an ASCII letter are discarded.

    Returns:
        ``{key: tuple_of_items}`` where ``key`` is ``'roles_applied'`` for
        ``'roles applying for'`` and ``heading`` otherwise, or
        ``{heading: 'Not Found/Incorrect Format'}`` when no item remains.
    """
    if heading.lower() == 'interests':
        # Join with ',' so the last item of one line and the first item of
        # the next line stay separate.
        data = ','.join(data).split(',')
    items = get_data_without_junk(data)

    if not items:
        return {heading: 'Not Found/Incorrect Format'}

    # Currently these are wanted in tuple format.
    key = 'roles_applied' if heading == 'roles applying for' else heading
    return {key: tuple(items)}


def get_single_line_fields(data, heading):
    """Return the single line expected under ``heading``.

    Returns:
        ``{heading: line}`` for exactly one line; otherwise ``{heading:
        message}`` describing the format problem.
    """
    if len(data) > 1:
        return {heading: f'Incorrect Format only expected one line for {heading}'}
    if not data:
        return {heading: 'Not found/Incorrect Format'}
    return {heading: data[0]}


def extract(text_list):
    """Build the dictionary of resume fields from the resume's text lines.

    Args:
        text_list: Iterable of text lines of the resume.

    Returns:
        A dict of parsed fields, or ``{0: message}`` when none of the
        supported headings is found.
    """
    sections = get_data_in_each_section(text_list)
    if not sections:
        return {0: 'None of the expected headings were found/ Incorrect Format'}

    fields = {}
    for section, lines in sections.items():
        heading = section.strip().lower()
        if heading == 'name':
            fields.update(get_name_fields(lines))
        elif heading in ('experience', 'education'):
            # NOTE: on a format error the helper returns {heading: message},
            # which ends up nested one level under the same heading here.
            fields.update({heading: get_descriptive_fields(lines, heading)})
        elif heading in ('skills', 'interests', 'roles applying for'):
            fields.update(get_list_fields(lines, heading))
        elif heading == 'current notice period':
            fields.update(get_single_line_fields(lines, heading))
    return fields


def allowed_format(fname):
    """Return True if ``fname`` has an allowed extension (case-insensitive)."""
    return '.' in fname and fname.rsplit('.', 1)[-1].lower() in ALLOWED_EXTENSIONS


def _read_text_lines(file):
    """Read a plain-text resume and return its stripped lines as ``str``.

    Accepts either an object with ``readlines()`` or a filesystem path;
    byte lines are decoded as UTF-8.
    """
    if hasattr(file, 'readlines'):
        raw_lines = file.readlines()
    else:
        with open(file, 'rb') as handle:
            raw_lines = handle.readlines()
    return [
        (line.decode('UTF-8') if isinstance(line, bytes) else line).strip()
        for line in raw_lines
    ]


def process_resume(filename, file):
    """Detect the resume's type and extract its fields.

    The MIME type is obtained from the external ``file`` command, so ``file``
    must be something ``subprocess.run`` accepts as an argument (a path).

    Args:
        filename: Original file name; currently not used for processing.
        file: Path of the stored resume.

    Returns:
        The dict produced by ``extract``, or ``0`` when the detected type is
        not PDF, DOCX or plain text.
    """
    probe = sp.run(['file', '-b', '--mime-type', file], capture_output=True)
    mime_type = probe.stdout.decode()

    if 'application/pdf' in mime_type:
        text_list = txt_from_pdf(file)
    elif 'wordprocessing' in mime_type:
        text_list = txt_from_docx(file)
    elif 'text/plain' in mime_type:
        text_list = _read_text_lines(file)
    else:
        return 0

    # Debug level only: the resume text is personal data and should not be
    # written to stdout unconditionally.
    logger.debug('resume text lines: %s', text_list)
    return extract(text_list)