UnboundLocalError (Local variable referenced before assignment) Django - django-templates

For this feature, I am going to decrypt a string of morse code, back to a sentence.
But an error is raised: UnboundLocalError (local variable 'space' referenced before assignment).
I researched online; people use global to solve the problem, but that doesn't work for me, and since I only use the variable locally I don't want a global to affect my code later.
Here is my view:
def decipher(request):
    """The Decipher Page: decode a Morse-code string submitted via GET.

    Reads the ``a1`` query parameter, decodes it from Morse (letters
    separated by one space, words by two spaces) and renders the result.
    """
    MORSE_CODE_DICT = {'A': '.-', 'B': '-...',
                       'C': '-.-.', 'D': '-..', 'E': '.',
                       'F': '..-.', 'G': '--.', 'H': '....',
                       'I': '..', 'J': '.---', 'K': '-.-',
                       'L': '.-..', 'M': '--', 'N': '-.',
                       'O': '---', 'P': '.--.', 'Q': '--.-',
                       'R': '.-.', 'S': '...', 'T': '-',
                       'U': '..-', 'V': '...-', 'W': '.--',
                       'X': '-..-', 'Y': '-.--', 'Z': '--..',
                       '1': '.----', '2': '..---', '3': '...--',
                       '4': '....-', '5': '.....', '6': '-....',
                       '7': '--...', '8': '---..', '9': '----.',
                       '0': '-----', ', ': '--..--', '.': '.-.-.-',
                       '?': '..--..', '/': '-..-.', '-': '-....-',
                       '(': '-.--.', ')': '-.--.-'}
    # Reverse table: morse sequence -> character (avoids the O(n)
    # list(...).index(...) scan per decoded character).
    MORSE_CODE_DICT_REV = {v: k for k, v in MORSE_CODE_DICT.items()}

    def decrypt(message):
        """Decode a Morse string; returns the plain-text sentence."""
        # extra space added at the end to access the last morse code
        message += ' '
        decipherMsg = ''
        citext = ''
        # FIX: initialise the space counter BEFORE the loop. Previously it
        # was only assigned inside the non-space branch, so a message
        # starting with a space raised UnboundLocalError.
        space = 0
        for letter in message:
            if letter != ' ':
                # counter to keep track of space
                space = 0
                # storing morse code of a single character
                citext += letter
            else:
                # space == 1 indicates a new character
                space += 1
                # space == 2 indicates a new word
                if space == 2:
                    # adding space to separate words
                    decipherMsg += ' '
                elif citext != '':
                    # FIX: only look up non-empty sequences — a bare 'else'
                    # crashed on leading/extra spaces (lookup of '').
                    decipherMsg += MORSE_CODE_DICT_REV[citext]
                    citext = ''
        return decipherMsg

    val1 = request.GET.get('a1', '')
    res = decrypt(val1)
    return render(request, 'morse_logs/decipher.html', {'result': res})
My html:
{% block content %}
<h1>Decipher</h1>
<form action="" method="get" >
<textarea rows="10" cols="50" name='a1' ></textarea>
<textarea rows="10" cols="50" name='a2' > {{result}} </textarea>
<button type="submit" name="cipher">Cipher</button>
{% comment %}
<textarea rows="10" cols="50" name="a3" > {{result}} </textarea>
{% endcomment %}
</form>
{% endblock content %}

The reason this happens is because you use the space variable, before assigning it a value. This can happen if the first character of the message is for example a space.
Furthermore you better make a dictionary that maps in reverse, and perform a check that the citext contains at least one character:
# Invert the encoding table so each Morse sequence maps back to its character.
MORSE_CODE_DICT_REV = {code: char for char, code in MORSE_CODE_DICT.items()}


def decrypt(message):
    """Translate *message* (space-separated Morse code) back to plain text.

    One space separates letters, two spaces separate words. A trailing
    space is appended so the final letter is flushed.
    """
    decoded = ''
    current = ''  # morse symbols accumulated for the character in progress
    gap = 0       # count of consecutive spaces seen so far
    for symbol in message + ' ':
        if symbol == ' ':
            gap += 1
            if gap == 2:
                # second space in a row -> word boundary
                decoded += ' '
            elif current != '':
                # single space -> letter boundary; look up the sequence
                decoded += MORSE_CODE_DICT_REV[current]
                current = ''
        else:
            gap = 0
            current += symbol
    return decoded

Related

Vocab for LSA Topic Modelling returning letters rather than words

I am trying to topic model a list of descriptions using LSA. When I tokenize and then create a vocab from the descriptions, the vocab returns letters rather than words.
# Stopword / normalisation setup for cleaning the description corpus.
my_stopwords = nltk.corpus.stopwords.words('english')
# Stemming callable (note: built but not used in clean_tokens below).
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
# Characters to strip from the text before tokenising.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•#'
# Domain-specific words to drop in addition to NLTK's English stopwords.
custom_stopwords = ['author', 'book', 'books', 'story', 'stories', 'novel', 'series', 'collection', 'edition', 'volume', 'readers', 'reader', 'reprint', 'writer', 'writing']
final_stopword_list = custom_stopwords + my_stopwords
# cleaning master function
def clean_tokens(tokens):
    """Normalise a raw description string and return a LIST of word tokens.

    FIX: a callable passed as ``CountVectorizer(tokenizer=...)`` must return
    a list of tokens. The original returned a re-joined string, so the
    vectorizer iterated it character by character — which is exactly why the
    vocabulary came out as single letters. Returning the token list yields
    word features.
    """
    tokens = (tokens).lower()  # lower case
    tokens = re.sub('[' + my_punctuation + ']+', ' ', tokens)  # strip punctuation
    tokens = re.sub('([0-9]+)', '', tokens)  # remove numbers
    # Drop stopwords and the empty strings produced by consecutive spaces
    # (the '' entries also showed up as bogus vocabulary features).
    token_list = [word for word in tokens.split(' ')
                  if word and word not in final_stopword_list]
    return token_list
This is my tokenizer
# Build the document-term matrix using the custom cleaner as tokenizer.
# NOTE(review): as posted, clean_tokens returns a joined STRING, so the
# vectorizer iterates it character by character — hence the letter-only
# vocabulary printed below. The tokenizer must return a list of tokens.
count_vectoriser = CountVectorizer(tokenizer=clean_tokens)
bag_of_words = count_vectoriser.fit_transform(df.Description)
vocab = count_vectoriser.get_feature_names_out()
print(vocab[:10])
And my vocab, which returns
[' ' '#' '\\' 'a' 'b' 'c' 'd' 'e' 'f' 'g']
When I want it to give me words
I am tokenizing from a pandas dataframe so I don't know if that is altering the way I am tokenizing.

concatenate values in dataframe if a column has specific values and None or Null values

I have a dataframe with name+address/email information based on the type. Based on a type I want to concat name+address or name+email into a new column (concat_name) within the dataframe. Some of the types are null and are causing ambiguity errors. Identifying the nulls correctly in place is where I'm having trouble.
# Example frame: Type decides whether City or Email is concatenated to Name.
NULL = None
data = {
    'Type': [NULL, 'MasterCard', 'Visa','Amex'],
    'Name': ['Chris','John','Jill','Mary'],
    'City': ['Tustin','Cleveland',NULL,NULL ],
    'Email': [NULL,NULL,'jdoe#yahoo.com','mdoe#aol.com']
}
df_data = pd.DataFrame(data)
#Expected resulting df column:
# NOTE(review): the line below is missing a closing quote after
# 'Jilljdoe#yahoo.com' and does not parse as posted.
df_data['concat_name'] = ['ChrisTustin', 'JohnCleveland','Jilljdoe#yahoo.com,'Marymdoe#aol.com']
# NOTE(review): the conditions below have unbalanced brackets, and using a
# whole Series as an `if` condition is what raises the
# "truth value of a Series is ambiguous" ValueError quoted underneath.
if df_data['Type'].isnull() | df_data[df_data['Type'] == 'Mastercard':
    df_data['concat_name'] = df_data['Name']+df_data['City']
if df_data[df_data['Type'] == 'Visa' | df_data[df_data['Type'] == 'Amex':
    df_data['concat_name'] = df_data['Name']+df_data['Email']
else:
    df_data['concat_name'] = 'Error'
Error
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Attempt two using np.where
# NOTE(review): this attempt has unbalanced parentheses and a missing quote
# in 'Type] — it does not parse as posted; kept verbatim as the question's code.
df_data['concat_name'] = np.where((df_data['Type'].isna()|(df_data['Type']=='MasterCard'),df_data['Name']+df_data['City'],
np.where((df_data['Type']=="Visa")|(df_data['Type]=="Amex"),df_data['Name']+df_data['Email'], 'Error'
Error
ValueError: Length of values(2) does not match length of index(12000)
Does the following code solve your use case?
# == Imports needed ===========================
import pandas as pd
import numpy as np

# == Example Dataframe =========================
df_data = pd.DataFrame(
    {
        "Type": [None, "MasterCard", "Visa", "Amex"],
        "Name": ["Chris", "John", "Jill", "Mary"],
        "City": ["Tustin", "Cleveland", None, None],
        "Email": [None, None, "jdoe#yahoo.com", "mdoe#aol.com"],
        # Expected output:
        "concat_name": [
            "ChrisTustin",
            "JohnCleveland",
            "Jilljdoe#yahoo.com",
            "Marymdoe#aol.com",
        ],
    }
)

# == Solution Implementation ====================
def _as_text(column):
    """Return the column as strings, with missing values mapped to ''."""
    return df_data[column].astype(str).replace("None", "")

# Row classification: missing Type / MasterCard -> Name+City,
# Visa / Amex -> Name+Email, anything else -> 'Error'.
_city_rows = df_data["Type"].isin(["MasterCard", pd.NA, None])
_email_rows = df_data["Type"].isin(["Visa", "Amex"])

df_data["concat_name2"] = np.where(
    _city_rows,
    _as_text("Name") + _as_text("City"),
    np.where(_email_rows, _as_text("Name") + _as_text("Email"), "Error"),
)

# == Expected Output ============================
# concat_name2 should reproduce the hand-written concat_name column.
print(df_data)
Notes
You might also consider simplifying the problem, by replacing the first condition (Type == 'MasterCard' or None) with the opposite of your second condition (Type == 'Visa' or 'Amex'):
# Simplified variant: because rows are either card-type (-> City) or
# Visa/Amex (-> Email), one np.where with the negated Visa/Amex test suffices.
# NOTE(review): relies on df_data defined above; drops the 'Error' fallback.
df_data["concat_name2"] = np.where(
    (~df_data["Type"].isin(["Visa", "Amex"])),
    df_data["Name"].astype(str).replace("None", "")
    + df_data["City"].astype(str).replace("None", ""),
    df_data["Name"].astype(str).replace("None", "")
    + df_data["Email"].astype(str).replace("None", "")
)
Additionally, if you are dealing with messy data, you can also improve the implementation by converting the Type column to lowercase, or uppercase. This makes your code also account for cases where you have values like "mastercard", or "Mastercard", etc.:
# Case-insensitive variant: Type is lower-cased before matching so messy
# values like "mastercard"/"Mastercard" are handled. astype(str) turns
# missing values into the string "none", hence "none" in the match list.
df_data["concat_name2"] = np.where(
    (df_data["Type"].astype(str).str.lower().isin(["mastercard", pd.NA, None, "none"])),
    df_data["Name"].astype(str).replace("None", "")
    + df_data["City"].astype(str).replace("None", ""),
    np.where(
        (df_data["Type"].astype(str).str.lower().isin(["visa", "amex"])),
        df_data["Name"].astype(str).replace("None", "")
        + df_data["Email"].astype(str).replace("None", ""),
        "Error",
    ),
)

Converting recursive solution to dynamic programming

Problem statement: find the number of "vowel only" strings that can be made from a given sequence of Morse code (the entire string must be used)
I have this current recursive solution. I want to speed up this algorithm to run in O(n) time. I know that I can define my array as S[j] = the maximum number of unique strings that can be created with access from 1 ... j. But I don't know where to go from there.
# Morse codes of the five vowels; codes are 1 to 3 symbols long.
morsedict = {'A': '.-',
             'E': '.',
             'I': '..',
             'O': '---',
             'U': '..-'}

# Running total of complete decodings found; reset to 0 before each
# top-level call to countCombinations.
maxcombinations = 0


def countCombinations(codelist):
    """Count the ways *codelist* can be split into vowel Morse codes.

    Exhaustive recursion: tries consuming a 1-, 2- or 3-symbol prefix that
    encodes a vowel; each path that consumes the whole string increments the
    global ``maxcombinations``. Returns None (result is in the global).
    """
    # FIX: was `len(codelist) is 0` — `is` compares object identity, not
    # value, and is unreliable (and a SyntaxWarning) for int literals.
    if len(codelist) == 0:
        global maxcombinations
        maxcombinations += 1
        return
    if codelist[0] in morsedict.values():
        countCombinations(codelist[1:])
    if len(codelist) >= 2 and codelist[:2] in morsedict.values():
        countCombinations(codelist[2:])
    if len(codelist) >= 3 and codelist[:3] in morsedict.values():
        countCombinations(codelist[3:])
    return
For future researchers here is the solution for conversion to a DP problem:
# Morse codes of the five vowels; codes are 1 to 3 symbols long.
morsedict = {'A': '.-',
             'E': '.',
             'I': '..',
             'O': '---',
             'U': '..-'}


def countcombinations(codelist):
    """Print how many ways *codelist* splits into vowel Morse codes — O(n) DP.

    Fixes over the posted version: `is 1/2/3` identity comparisons replaced
    by `==`; no IndexError on inputs of length < 2 (it indexed
    maxcombinations[1] unconditionally); no accidental negative-index
    lookback (maxcombinations[i - 3] wraps around for i < 3); no double
    counting of the length-2 prefix at i == 1.
    """
    codes = set(morsedict.values())  # set for O(1) membership tests
    n = len(codelist)
    # dp[j] = number of ways to fully decode the first j symbols.
    dp = [0] * (n + 1)
    dp[0] = 1  # empty prefix: exactly one way (decode nothing)
    for j in range(1, n + 1):
        # A decoding of the first j symbols must end with a code of
        # length 1, 2 or 3 ending at position j.
        for k in (1, 2, 3):
            if j >= k and codelist[j - k:j] in codes:
                dp[j] += dp[j - k]
    # Same observable behaviour as the original: print the final count.
    print(dp[n])


if __name__ == "__main__":
    input()  # first line of input is discarded (as in the original)
    codelist = input()
    countcombinations(codelist)

Multiple creations on Odoo/Openerp res.partner table fail when one by one succeed

Here is what I am trying to do, I have a CSV file that has some records that I want to import on my installation. The following code works when I put a return None statement on the end of the loop. (Which you will find commented out
Upon the click of a button this method will be executed in order to loop through all the rows on the CSV and depending on the conditions create a new res.partner record.
(this has been implemented via XML-RPC calls and it worked flawlessly.)
The problem is this: When I stop the execution after the import of one CSV row, everything works (the record on res.partner is created), when I leave it running with the loop no records are created on the table res.partner (I get no errors, no exceptions whatsoever. I even get the newly created record's id upon the invocation of res_partner_obj.create() ).
Note: You will find lots of irrelevant code to my problem (which is why the records on res.partner are not created). I just put it there for the sake of completeness)
def gms_import_test(self, cr, uid, vals, context=None, check=True):
    """Bulk-import customers from a CSV file into res.partner.

    Old OpenERP/Odoo API (cr/uid, self.pool). For each CSV row: builds the
    customer name, skips existing partners, creates the partner with its
    Kundengruppe/country/title, assigns a sales team from the postcode,
    creates/links a receivable account, and logs phone-call entries.

    NOTE(review): this is Python 2 code (print statements,
    ``except Exception, e``). Indentation below is reconstructed from the
    flattened paste — confirm block boundaries against the original file.
    """
    start_time = time.time()
    # Model proxies from the registry pool.
    res_partner_obj = self.pool.get('res.partner')
    sales_teams_obj = self.pool.get('crm.case.section')
    res_partner_title_obj = self.pool.get('res.partner.title')
    customer_kunden_obj = self.pool.get('customer.kundengruppe')
    res_country_obj = self.pool.get('res.country')
    account_account_obj = self.pool.get('account.account')
    crm_phonecall_obj = self.pool.get('crm.phonecall')
    account_account_type_obj = self.pool.get('account.account.type')
    # sys.path[0] == module's path + /addons/gmsimport/+ file's name
    with open(sys.path[0] + '/addons/gmsimport/' + '21.9.2015.try8.csv') as csv_file: # TODO THESE MUST CHANGE UPON DEPLOYMENT TO DIFFERENT MACHINES
        csv_reader = csv.reader(csv_file, delimiter='~', quotechar='^')
        # Get the teams
        sales_team_direct_sales_ID = sales_teams_obj.search(cr, uid, [('name', '=', 'Direct Sales')])
        sales_team_0_ID = sales_teams_obj.search(cr, uid, [('name', '=', '0')])
        sales_team_1_ID = sales_teams_obj.search(cr, uid, [('name', '=', '1')])
        sales_team_2_ID = sales_teams_obj.search(cr, uid, [('name', '=', '2')])
        sales_team_3_ID = sales_teams_obj.search(cr, uid, [('name', '=', '3')])
        sales_team_4_ID = sales_teams_obj.search(cr, uid, [('name', '=', '4')])
        sales_team_5_ID = sales_teams_obj.search(cr, uid, [('name', '=', '5')])
        sales_team_6_ID = sales_teams_obj.search(cr, uid, [('name', '=', '6')])
        sales_team_7_ID = sales_teams_obj.search(cr, uid, [('name', '=', '7')])
        sales_team_8_ID = sales_teams_obj.search(cr, uid, [('name', '=', '8')])
        sales_team_9_ID = sales_teams_obj.search(cr, uid, [('name', '=', '9')])
        # Search for the titles, create them if they do not exist
        damen_und_herren_title_ID = res_partner_title_obj.search(cr, uid, [('name', '=', 'Sehr geehrte Damen und Herren')])
        if not damen_und_herren_title_ID:
            damen_und_herren_title_ID = res_partner_title_obj.create(cr, uid, {'name':'Sehr geehrte Damen und Herren'})
        if type(damen_und_herren_title_ID) is list:
            damen_und_herren_title_ID = damen_und_herren_title_ID[0]
        frau_title_ID = res_partner_title_obj.search(cr, uid, [('name', '=', 'Sehr geehrte Frau')])
        if not frau_title_ID:
            frau_title_ID = res_partner_title_obj.create(cr, uid, {'name':'Sehr geehrte Frau'})
        if type(frau_title_ID) is list:
            frau_title_ID = frau_title_ID[0]
        herr_title_ID = res_partner_title_obj.search(cr, uid, [('name', '=', 'Sehr geehrter Herr')])
        if not herr_title_ID:
            herr_title_ID = res_partner_title_obj.create(cr, uid, {'name':'Sehr geehrter Herr'})
        if type(herr_title_ID) is list:
            herr_title_ID = herr_title_ID[0]
        account_type_id = account_account_type_obj.search(cr, uid, [('name', '=', 'Receivable')])
        # Checking to see whether there exists the "1200 - Forderungen aus Lieferungen und Leistungen account"
        forderungen_account = account_account_obj.search(cr, uid, [('code', '=', 1200)])
        if type(forderungen_account) is list and len(forderungen_account) > 0:
            forderungen_account = forderungen_account[0]
        account_payable_ID = account_account_obj.search(cr, uid, [('code', '=', 'Kunden - Diverse1')])
        if type(account_payable_ID) is list:
            account_payable_ID = account_payable_ID[0]
        next(csv_reader, None) # To skip header row.
        row_counter = 2
        empty_name_counter = 0
        for row in csv_reader:
            print 'PROCESS IN ROW: ' + str(row_counter)
            final_customer_name = None
            if len(str(row[15]).strip()) > 0:
                # If Firma is not empty, customer's name == Firma + Zusatz
                final_customer_name = row[15] + ' ' + row[64]
            else:
                # If Firma is empty, customer's name == Vorname + Name
                final_customer_name = row[63] + ' ' + row[45]
            # NOTE(review): counter increments when the name IS present,
            # despite its name — kept as posted.
            if final_customer_name:
                empty_name_counter += 1
            logging.info("Customer name is " + str(final_customer_name))
            # search for the customer's existance, if exists do not add her
            customer_id = res_partner_obj.search(cr, uid, [('name', '=', final_customer_name)])
            print 'Searching with customer name ' + final_customer_name
            if not customer_id:
                # Customer does not exist, only then make the calls
                # Fields with relations that must be treated in a special way
                # x_kundengruppe, country_id, name
                print 'customer.kundengruppe ' + str(row[6])
                x_kundengruppe_id = customer_kunden_obj.search(cr, uid, [('name', '=', row[6].decode('utf8'))])
                if not x_kundengruppe_id:
                    # kundergruppe does not exist (create a record on the model customer.kundengruppe and store its id)
                    print 'Creating kundengruppe'
                    x_kundengruppe_id = customer_kunden_obj.create(cr, uid, {'name':row[6]})
                country_id = []
                if str(row[27]).strip():
                    if str(row[27]) == 'Great Britain':
                        country_id = res_country_obj.search(cr, uid, [('name', '=', 'United Kingdom')])
                    else:
                        country_id = res_country_obj.search(cr, uid, [('name', '=', row[27])])
                # Map the employee-count bucket label to its selection key.
                mittarbeitergrupe = None
                if row[16] == '0 bis 4':
                    mittarbeitergrupe = '0_4'
                elif row[16] == '5 bis 9':
                    mittarbeitergrupe = '5_9'
                elif row[16] == '10 bis 19':
                    mittarbeitergrupe = '10_19'
                elif row[16] == '20 bis 34':
                    mittarbeitergrupe = '20_34'
                elif row[16] == '35 bis 49':
                    mittarbeitergrupe = '35_49'
                elif row[16] == '50 bis 99':
                    mittarbeitergrupe = '50_99'
                elif row[16] == 'über 100':
                    mittarbeitergrupe = 'uber_100'
                final_customer_number_list = []
                final_customer_number = None
                print row[10]
                if len(row[10]) < 8:
                    # Get row[10] length. Subtract it from 8. That's how many zeros must be put. Create a new string with zeros in front and number postfixed
                    zeros = 8 - len(row[10])
                    # final_customer_number_list.append('\'')
                    for y in range(0, zeros):
                        final_customer_number_list.append(str(0))
                    final_customer_number_list.append(str(row[10]))
                    # final_customer_number_list.append('\'')
                    final_customer_number = ''.join(final_customer_number_list)
                    print 'Customer\'s number length < 8. Prefixing ' + str(zeros) + ' zeros'
                    print 'final x_customer_number: ' + str(final_customer_number)
                else:
                    final_customer_number = str(row[10])
                # Make the country_id from an array to a single int variable
                if len(country_id) > 0:
                    print 'SETTING COUNTRY ID TO ' + str(country_id)
                    country_id = country_id[0]
                if type(x_kundengruppe_id) is list and len(x_kundengruppe_id) > 0:
                    x_kundengruppe_id = x_kundengruppe_id[0]
                comment_section = self.assemble_internal_log(row)
                final_title = self.assemble_customer_title(row, damen_und_herren_title_ID, frau_title_ID, herr_title_ID)
                # Check x_mitarbeiter
                if len(row[2].strip()) == 0:
                    row[2] = 0
                else:
                    row[2] = int(row[2])
                # Values for the res.partner record about to be created.
                fields_to_be_inserted = { 'x_kundengruppe':x_kundengruppe_id,
                                          'x_customer_number':final_customer_number, # (customer number cannot be less than 8 digits)
                                          'vat':row[12],
                                          'email':row[14],
                                          'name':final_customer_name,
                                          'x_mittarbeitergruppe':mittarbeitergrupe,
                                          'title':final_title,
                                          'x_a_kunde':row[23],
                                          'website':row[24],
                                          'country_id':country_id,
                                          'mobile':filter(lambda x: x.isdigit(), row[44]),
                                          'city':row[46],
                                          'zip':row[49],
                                          'function':str(row[50]),
                                          'street':row[57] + str(row[21]), # street and House No
                                          'fax':filter(lambda x: x.isdigit(), row[59]),
                                          'phone':filter(lambda x: x.isdigit(), row[60]),
                                          'comment':comment_section,
                                          'x_mitarbeiter':row[2],
                                          'property_account_payable':account_payable_ID,
                                          }
                log_entries = []
                # column_index_list = [67, 68, 76, 77, 85, 86, 94, 95, 103, 104, 112, 113, 121, 122, 130, 131, 139, 140, 148, 149, 157, 158, 166, 167, 175, 176, 184, 185, 193, 194, 202, 203, 211, 212, 220, 221, 229, 230, 238, 239, 247, 248]
                column_index_list = [67, 76, 85, 94, 103, 112, 121, 130, 139, 148, 157, 166, 175, 184, 193, 202, 211, 220, 229, 238, 247]
                # search through the CSV to find those fields that contain (log note/date) and put them in internal log
                for x in column_index_list:
                    if len(row[x].strip()) > 0:
                        print 'Log entry found, adding to the list'
                        log_entries.append(row[x + 1] + '||||' + row[x])
            if customer_id:
                # Customer exists, do not add her
                print 'Customer ' + final_customer_name + ' exists. We do not add this one.'
                logging.info('Customer ' + final_customer_name + ' exists. We do not add this one.')
            else:
                try:
                    logging.info('Creating customer ' + str(final_customer_name) + ', ' + str(row[15]))
                    created_customer_id = res_partner_obj.create(cr, uid, fields_to_be_inserted)
                    print 'CREATED CUSTOMER: ' + str(final_customer_name) + ' ID: ' + str(created_customer_id)
                    sales_team_id = None
                    # If PLZ field is not empty and is a 5 digit number
                    if row[49] and str(row[49]).isdigit() and len(str(row[49])) == 5:
                        # Check the first digit and assign to it a Sales Team
                        if int(str(row[49])[0]) == 1:
                            sales_team_id = sales_team_1_ID
                        elif int(str(row[49])[0]) == 2:
                            sales_team_id = sales_team_2_ID
                        elif int(str(row[49])[0]) == 3:
                            sales_team_id = sales_team_3_ID
                        elif int(str(row[49])[0]) == 4:
                            sales_team_id = sales_team_4_ID
                        elif int(str(row[49])[0]) == 5:
                            sales_team_id = sales_team_5_ID
                        elif int(str(row[49])[0]) == 6:
                            sales_team_id = sales_team_6_ID
                        elif int(str(row[49])[0]) == 7:
                            sales_team_id = sales_team_7_ID
                        elif int(str(row[49])[0]) == 8:
                            sales_team_id = sales_team_8_ID
                        elif int(str(row[49])[0]) == 9:
                            sales_team_id = sales_team_9_ID
                    # If the PLZ field is not empty and is a 4 digit number
                    elif row[49] and str(row[49]).isdigit() and len(str(row[49])) == 4: # int(row[49]) >= 0 and int(row[49]) <= 9999:
                        sales_team_id = sales_team_0_ID
                    # Everything else goes to the Direct Sales team
                    else:
                        sales_team_id = sales_team_direct_sales_ID
                    if len(sales_team_id) > 0:
                        print 'SECTION ID: ' + str(sales_team_id)
                        res_partner_obj.write(cr, uid, created_customer_id, {'section_id':str(sales_team_id[0])})
                    # personal account for each customer
                    # Check if account exists for certain customer
                    account_id = account_account_obj.search(cr, uid, [('name', '=', final_customer_name)])
                    if not account_id:
                        print 'Creating and linking new account for customer ' + final_customer_name + ' CODE: ' + str(row[10])
                        # Creating account (parent_id can be empty. If so, do not change the parent_id field; let it be)
                        account_id = account_account_obj.create(cr, uid, {'code': row[10],
                                                                          'name': final_customer_name,
                                                                          'type':'receivable',
                                                                          'user_type': account_type_id[0],
                                                                          'parent_id': forderungen_account
                                                                          })
                    # In any case assign the account to the customer
                    if type(account_id) is list:
                        account_id = account_id[0]
                    print 'ACCOUNT ID TO LINK' + str(account_id)
                    res_partner_obj.write(cr, uid, created_customer_id, {'property_account_receivable':account_id})
                except Exception, e:
                    #pass
                    print '===> ERROR ' + str(e) # If error on account creation, no problem keep going
                # Attach the collected log notes as done phone calls.
                for log_entry in log_entries:
                    log = log_entry.split('||||', 2)
                    crm_phonecall_obj.create(cr, uid, {'date':log[0], 'name':log[1], 'state':'done', 'partner_id':created_customer_id})
            # Safety valve: stop after ~500 seconds of processing.
            if time.time() - start_time > 500:
                print '500 secs passed'
                return None
            row_counter += 1
        #return None

Python Lex-Yacc (PLY) Error recovery at the end of input

Problem
I am trying to implement an error tolerant parser using Python Lex-Yacc (PLY), but I have trouble using error recovery rules at the end of my input string.
How can I recover from an unexpected end of input?
Example
This example grammar produces strings of the form A END A END A END A END ...
Statement : Expressions
Expressions : Expression Expressions
|
Expression : A END
I want to perform an error recovery if the END token was omitted, so strings like A A A END or A A A will be recognized by the parser.
My approach
I added an error recovery rule, which allows me to accept input like A A A END
Expression : A END
| A error
Which allows me to accept the following input:
A A A END
But if the last END token is omitted (A A A), I still get a syntax error and cannot recover.
Sample PLY code
from __future__ import print_function

# ---- Lexer ----------------------------------------------------------------
# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    # Lexer error hook: report and skip the offending character.
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# ---- Grammar rules --------------------------------------------------------
def p_statement_expr(p):
    '''statement : expressions'''
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expression expressions'''
    p[0] = [p[1]] + p[2]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    # "A error" is the recovery rule for an omitted END mid-stream;
    # it cannot fire when the very last END is missing (no lookahead token).
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    # p is None when the error occurs at end of input.
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

# Simple REPL driving the parser.
while 1:
    try:
        s = raw_input('query > ') # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)
I add it as a new answer (and do know it is too late for the bounty :-( ) because it is a very different approach. If we used flex, it would be much easier, since it has the notion of the &lt;&lt;EOF&gt;&gt; token that matches only at end of file. After thinking about that, I realized that it was very simple to add that functionality to PLY without any change to the original module by using a proxy around the lexer. And Python allows easy implementation of proxies thanks to the __getattr__ special method.
I just add
a new token EOF that will be send at end of file
a proxy around the token method of the lexer that on end of file returns the special EOF token on first pass and then the normal None
the eof token to end statement rule
And still reverse the rule expressions : expressions expression instead of expressions : expression expressions to allow immediate reduce
The code becomes :
from __future__ import print_function

# Tokens
# EOF is an artificial token emitted once by the proxy lexer at end of input,
# giving the parser a concrete lookahead token to finish/recover on.
tokens = ('A', 'END', 'EOF')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    # Lexer error hook: report and skip the offending character.
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
orig_lexer = lex.lex()

class ProxyLexer(object):
    """Wrap a PLY lexer so the first None from token() becomes a synthetic
    EOF token; all other attribute access is delegated to the real lexer."""

    def __init__(self, lexer, eoftoken):
        self.end = False      # True once the synthetic EOF has been emitted
        self.lexer = lexer
        self.eof = eoftoken   # token type name to fabricate at end of input

    def token(self):
        tok = self.lexer.token()
        if tok is None:
            if self.end :
                # Second call past end of input: yield the normal None.
                self.end = False
            else:
                # First call past end of input: fabricate the EOF token.
                self.end = True
                tok = lex.LexToken()
                tok.type = self.eof
                tok.value = None
                tok.lexpos = self.lexer.lexpos
                tok.lineno = self.lexer.lineno
            # print ('custom', tok)
        return tok

    def __getattr__(self, name):
        # Delegate everything else to the wrapped lexer.
        return getattr(self.lexer, name)

lexer = ProxyLexer(orig_lexer, 'EOF')

# Rules
def p_statement_expr(p):
    # statement now explicitly ends with the synthetic EOF token.
    '''statement : expressions EOF'''
    print("parsed:", p[1])

def p_expressions(p):
    # Left-recursive so each expression reduces immediately.
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
parser = yacc.yacc()

# REPL: note the proxy lexer is passed explicitly to parse().
while 1:
    try:
        s = raw_input('query > ') # use input() on Python 3
    except EOFError:
        break
    parser.parse(s, lexer = lexer)
That way :
the original grammar is unchanged
the error recovery method remains stupidly simple and has no dependance on the remaining of the grammar
it can be easily extended to complex parsers
As you want to accept all elements, you can explicitly declare a rule for an A not followed by an END and use the fact that yacc and PLY deal gracefully with ambiguous rules.
You can simply have a normal rule :
Expression : A END
and below a lower priority rule (as it comes later) that will issue a warning
Expression : A
That way, all A will be accepted, there won't be any syntax error, and the warning will be issued for any A not followed by a END including one at the end of the flow. In order to more easily find the offending A, I have added in the warning the position of the symbol in the flow.
Edit:
The script is modified to correctly deal with other syntax error (such as AENDENDAEND), and also to immediately reduce expressions by replacing expressions : expression expressions with expressions : expressions expression
Here is the modified script (tested in python 3.4 simply replacing raw_input with input):
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    print("parsed:", p[1])

def p_expressions(p):
    # Left-recursive so each expression reduces immediately.
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]

def p_expressions_err(p):
    # Swallow a bad token (e.g. a stray END) and keep what was collected.
    '''expressions : expressions error'''
    p[0] = p[1]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END'''
    p[0] = 'A'

# add a separate rule BELOW previous one to display a warning
def p_expression_pharse_warn(p):
    # Lower priority than "A END" (declared later); matches a lone A —
    # including one at the very end of input — and warns with its position.
    '''expression : A'''
    print("Warning at absolute position %d (line %d)" % (p.lexpos(1), p.lineno(1)))
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = raw_input('query > ') # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)
Edit : the following is an incorrect attempt to avoid an additional rule : it is more complex and less efficient than the above version. Please see my conclusion below
Edit per comment :
I understand your point that you do not want to multiply grammar rules. It is possible to be fault tolerant, except for last token. If your last token is in error, it will not be followed by anything and will never be caught in rule expression : A error.
But here is a fault tolerant parser that keeps everything except the last token in case of error on that one:
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    # print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]
    # Side effect: collect each parsed expression into the global list,
    # so partial results survive a syntax error at the end of input.
    result.append(p[2])

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    if p:
        global lasterr
        print("Syntax error at '%s' (%d)" % (p.value, p.lexpos))
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = input('query > ') # use input() on Python 3
    except EOFError:
        break
    # Reset the accumulator before each parse; printed even on error.
    result = []
    yacc.parse(s)
    print('Result', result)
The principle is to collect via expressions : expressions expression instead of expressions : expression expressions, and to accumulate everything in a global variable.
With an input of A END A A END A A A END it gives
Result ['A', 'A', 'A', 'A', 'A', 'A']
and with : A END A A END A A A END , it gives
Result ['A', 'A', 'A', 'A', 'A']
(all tokens but the last)
With a true flex - bison solution, it would be possible to make use of the special <<EOF>> token that matches at end of input, to always have another token after the last one. Unfortunately, it is not implemented in PLY, and the only real solution is to introduce a rule that accepts alone A token. For a real parser, it also guarantees that you are actually processing the correct token : I used
def p_expression_pharse(p):
    '''expression : A END'''
    # Positive value encodes "A END matched" plus the token's position.
    p[0] = 1 + p.lexpos(1)

# add a separate rule BELOW previous one to display a warning
def p_expression_pharse_warn(p):
    # Lower-priority rule for a lone A; negative value encodes the position
    # so offending tokens can be uniquely identified in the result.
    '''expression : A'''
    print("Warning at absolute position %d (line %d)" % (p.lexpos(1), p.lineno(1)))
    p[0] = -1 - p.lexpos(1)
to uniquely identify tokens in resul string, and I get correct positions.
And ... the error processing is very simple ...
Discussion TL/DR :
I admit I missed the point of last-token error recovery. It is because in all parsers I've seen in real use cases, the error recovery consisted in rejecting the part that was syntactically incorrect (and thus not directly useable) and re-synchronizing the parser on the next correct group of tokens. In all that I have seen, if a partial sentence can be used, it must not be processed by the error recovery mechanism but by a grammar rule, in which it is easy to describe the appropriate action.
If you just want to keep the offending input for later processing, I think it is not a problem of action depending of a syntax, and I would simply note the position of offending token, or at most note the position of last correctly analysed token (the end of a complete element), the begin of first error recovery token and say that what is between is incorrect.
But it would be much different than what is asked here ...
This works for all examples I could imagine
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    #
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expression expressions'''
    p[0] = p[1] + p[2]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END'''
    p[0] = ['A']

def p_expression_error(p):
    # Recovery production: the error symbol may carry a value (see p_error),
    # which is folded into the result list when present.
    '''expression : A error'''
    p[0] = ['A']
    if p[2] is not None:
        p[0] += p[2]

def p_error(p):
    # Custom recovery: fabricate an 'error' YaccSymbol and return it so the
    # parser resumes — both at end of input (p is None) and on a bad token.
    if p is None:
        print("Syntax error at EOI")
        e = yacc.YaccSymbol()
        e.type = 'error'
        e.value = None
        yacc.errok()
        return e
    elif p.type == 'error':
        # Already in recovery: just reset the error state.
        yacc.errok()
        return
    elif hasattr(p, 'value'):
        print("Syntax error at '%s'" % p.value)
        e = yacc.YaccSymbol()
        e.type = 'error'
        e.value = p.value
        yacc.errok()
        return e

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = raw_input('query > ') # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)