How can I create the grammar definition to correctly parse an input - yacc

Lex file
import ply.lex as lex

# List of token names.
tokens = (
    "SYMBOL",
    "COUNT",
)

t_SYMBOL = r"Cl|Ca|Co|Os|C|H|O"

def t_COUNT(t):
    r"\d+"
    t.value = int(t.value)
    return t

def t_error(t):
    raise TypeError("Unknown text '%s'" % (t.value,))

atomLexer = lex.lex()

data1 = "CH3Cl"
data = "OClOsOH3C"

def testItOut():
    # Give the lexer some input
    atomLexer.input(data1)
    # Tokenize
    tok = atomLexer.token()
    while tok:
        print(tok)
        tok = atomLexer.token()
Parse file
import ply.yacc as yacc

# Get the token map from the lexer.
from atomLex import tokens

def p_expression_symbol(p):
    'molecule : SYMBOL'
    p[0] = p[1]

def p_error(p):
    raise TypeError("unknown text at %r" % (p.value,))

atomParser = yacc.yacc()

def testItOut():
    # Give the parser some input
    s = input('Type a chemical name > ')
    # Parse it
    result = atomParser.parse(s)
    print('The atom is: ' + result)

while True:
    testItOut()
Currently I would like to be able to enter CH3Cl, but within my parse file I am not entirely sure how to write the grammar definitions I have been given:
chemical : chemical molecule
chemical : molecule
molecule : SYMBOL COUNT
molecule : SYMBOL
What would the grammar definitions for these be within the parse file? Thank you.

There is a nice set of documentation for PLY with examples, which can be used to answer this question: http://www.dabeaz.com/ply/ply.html
Section 6.2 is particularly helpful. I suggest you change this code:
def p_expression_symbol(p):
    'molecule : SYMBOL'
    p[0] = p[1]
to include the new rules. The name p_expression_symbol is also no longer appropriate; I guess you copied it from one of the examples. We now have:
def p_chemical_formula(p):
    '''chemical : chemical molecule
                | molecule
       molecule : SYMBOL COUNT
                | SYMBOL'''
    p[0] = p[1]
There are also other useful examples in the documentation that can be applied to your exercise.
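For completeness, here is a minimal sketch of what the parse file could look like with one action per rule, so that the result also carries the counts; the p_ function names and the list-of-(symbol, count) representation are my own choices, not part of the original exercise:

import ply.yacc as yacc
# Get the token map and lexer from the lex file.
from atomLex import tokens, atomLexer

def p_chemical_chain(p):
    'chemical : chemical molecule'
    p[0] = p[1] + [p[2]]

def p_chemical_single(p):
    'chemical : molecule'
    p[0] = [p[1]]

def p_molecule_count(p):
    'molecule : SYMBOL COUNT'
    p[0] = (p[1], p[2])

def p_molecule_symbol(p):
    'molecule : SYMBOL'
    # a bare symbol counts as one atom
    p[0] = (p[1], 1)

def p_error(p):
    raise TypeError("unknown text at %r" % (p.value,))

atomParser = yacc.yacc()

# atomParser.parse("CH3Cl", lexer=atomLexer) should give [('C', 1), ('H', 3), ('Cl', 1)]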

Related

Cannot replace spaCy lemmatized pronouns (-PRON-) through text

I'm trying to lemmatise a text with spaCy. Since spaCy uses -PRON- as lemma for personal pronouns, I want to keep the original text in all those cases.
Here's the relevant section of my code:
...
fout = open('test.txt', 'w+')
doc = nlp(text)
for word in doc:
    if word.lemma_ == "-PRON-":
        write = word.text
        print(write)
    else:
        write = word.lemma_
    fout.write(str(write))
    fout.write(" ")
...
The print statement does print the original words for the cases where spaCy attributes the lemma '-PRON-'.
However, my output file (test.txt) always contains '-PRON-' for those cases, even though I would expect it to write the original words for those cases (I, us etc.)
What am I missing?
I tried different versions, including using the pos_ tag to identify the pronouns, but always with the same result: my output contains '-PRON-'s.
Try this somewhat altered code snippet to see what you get...
import spacy

nlp = spacy.load('en_core_web_sm')
text = 'Did he write the code for her?'
doc = nlp(text)

out_sent = [w.lemma_ if w.lemma_ != '-PRON-' else w.text for w in doc]
out_sent = ' '.join(out_sent)
print(out_sent)

with open('out_sent.txt', 'w') as f:
    f.write(out_sent + '\n')
This should produce...
do he write the code for her ?
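If you would rather keep the original loop structure, the same fix works there too; one plausible reason test.txt still showed '-PRON-' is that fout is never closed in the question's snippet, so the output may never be flushed to disk. A minimal sketch, assuming nlp and text are defined as in the question:

# same if/else logic, but with a context manager so the file is flushed and closed
with open('test.txt', 'w') as fout:
    for word in nlp(text):
        write = word.text if word.lemma_ == '-PRON-' else word.lemma_
        fout.write(write)
        fout.write(' ')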

How to create my own handwriting data set like IAM dataset

I need to create my own handwritten character dataset in the same format as the IAM Handwriting Database (you can check the dataset format on their site). I don't know how to create such a dataset; I need to produce data/ascii/words.txt and data/words/.
There are no instructions for creating the IAM Handwriting Database, but you can find guidance here: Build a Handwritten Text Recognition System using TensorFlow.
import os
import numpy as np
import cv2

class DataProvider():
    "this class creates machine-written text for a word list. TODO: change getNext() to return your samples."

    def __init__(self, wordList):
        self.wordList = wordList
        self.idx = 0

    def hasNext(self):
        return self.idx < len(self.wordList)

    def getNext(self):
        img = np.ones((32, 128), np.uint8) * 255
        word = self.wordList[self.idx]
        self.idx += 1
        cv2.putText(img, word, (2, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0), 1, cv2.LINE_AA)
        return (word, img)

def createIAMCompatibleDataset(dataProvider):
    "this function converts the passed dataset to an IAM compatible dataset"
    # create files and directories
    f = open('words.txt', 'w+')
    if not os.path.exists('sub'):
        os.makedirs('sub')
    if not os.path.exists('sub/sub-sub'):
        os.makedirs('sub/sub-sub')
    # go through data and convert it to IAM format
    ctr = 0
    while dataProvider.hasNext():
        sample = dataProvider.getNext()
        # write img
        cv2.imwrite('sub/sub-sub/sub-sub-%d.png' % ctr, sample[1])
        # write filename, dummy-values and text
        line = 'sub-sub-%d' % ctr + ' X X X X X X X ' + sample[0] + '\n'
        f.write(line)
        ctr += 1

if __name__ == '__main__':
    words = ['some', 'words', 'for', 'which', 'we', 'create', 'text-images']
    dataProvider = DataProvider(words)
    createIAMCompatibleDataset(dataProvider)
The source code was written by Harald Scheidl.
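To sanity-check the result, you can read the generated words.txt back and split each line the way IAM-style loaders typically do; this is just an illustrative sketch based on the line format written above (a file id, seven dummy fields, then the word):

# read back the generated words.txt: file id, seven dummy 'X' fields, then the word
with open('words.txt') as f:
    for line in f:
        parts = line.strip().split(' ')
        fileId, transcription = parts[0], parts[-1]
        print(fileId, '->', transcription)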

how to make R datafile to Python type

I want to convert R data files to Python data types. Below is the whole code:
import os
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import numpy2ri, pandas2ri

def convert_datafiles(datasets_folder):
    numpy2ri.activate()
    pandas2ri.activate()
    for root, dirs, files in os.walk(datasets_folder):
        for name in files:
            # sort out .RData files
            if name.endswith('.RData'):
                name_ = os.path.splitext(name)[0]
                name_path = os.path.join(datasets_folder, name_)
                # create sub-directory
                if not os.path.exists(name_path):
                    os.makedirs(name_path)
                file_path = os.path.join(root, name)
                robj = robjects.r.load(file_path)
                # check out subfiles in the data frame
                for var in robj:
                    ###### error happens right here
                    myRData = pandas2ri.ri2py_dataframe(var)
                    ###### error happens right here
                    # convert to DataFrame
                    if not isinstance(myRData, pd.DataFrame):
                        myRData = pd.DataFrame(myRData)
                    var_path = os.path.join(datasets_folder, name_, var + '.csv')
                    myRData.to_csv(var_path)
                os.remove(os.path.join(datasets_folder, name))  # clean up
    print("=> Success!")
I want to convert the R data to Python types, but this error keeps popping up: AttributeError: 'str' object has no attribute 'dtype'.
What should I do to resolve this error?
The rpy2 documentation is somewhat incomplete when it comes to interaction with pandas, but unit tests will provide examples of conversion. For example:
rdataf = robjects.r('data.frame(a=1:2, '
                    '           b=I(c("a", "b")), '
                    '           c=c("a", "b"))')
with localconverter(default_converter + rpyp.converter) as cv:
    pandas_df = robjects.conversion.ri2py(rdataf)
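Pulling that together, a self-contained version might look like this; note the assumption that rpyp refers to the pandas2ri module, and that this targets the rpy2 2.x-era API (ri2py was later renamed rpy2py in rpy2 3.x):

import rpy2.robjects as robjects
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter

rdataf = robjects.r('data.frame(a=1:2, b=I(c("a", "b")), c=c("a", "b"))')

# convert the R data.frame to a pandas DataFrame inside a local converter context
with localconverter(default_converter + pandas2ri.converter) as cv:
    pandas_df = robjects.conversion.ri2py(rdataf)

print(pandas_df)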

Python Lex-Yacc (PLY) Error recovery at the end of input

Problem
I am trying to implement an error tolerant parser using Python Lex-Yacc (PLY), but I have trouble using error recovery rules at the end of my input string.
How can I recover from an unexpected end of input?
Example
This example grammar produces strings of the form A END A END A END A END ...
Statement   : Expressions
Expressions : Expression Expressions
            |
Expression  : A END
I want to perform an error recovery if the END token was omitted, so strings like A A A END or A A A will be recognized by the parser.
My approach
I added an error recovery rule, which allows me to accept input like A A A END
Expression : A END
           | A error
This allows me to accept the following input:
A A A END
But if the last END token is omitted (A A A), I still get a syntax error and cannot recover.
Sample PLY code
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expression expressions'''
    p[0] = [p[1]] + p[2]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = raw_input('query > ')  # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)
I add it as a new answer (and do know it is too late for the bounty :-( ) because it is a very different approach. If we used flex, it would be much easier, since it has the notion of the <<EOF>> token that matches only at end of file. After thinking about it, I realized that it is very simple to add that functionality to PLY without any change to the original module, by using a proxy around the lexer. And Python allows easy implementation of proxies thanks to the __getattr__ special method.
I just add:
- a new token EOF that will be sent at end of file
- a proxy around the token method of the lexer that, at end of file, returns the special EOF token on the first pass and then the normal None
- the EOF token to the end of the statement rule

And I still reverse the rule to expressions : expressions expression instead of expressions : expression expressions to allow an immediate reduce.
The code becomes:
from __future__ import print_function

# Tokens
tokens = ('A', 'END', 'EOF')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
orig_lexer = lex.lex()

class ProxyLexer(object):
    def __init__(self, lexer, eoftoken):
        self.end = False
        self.lexer = lexer
        self.eof = eoftoken

    def token(self):
        tok = self.lexer.token()
        if tok is None:
            if self.end:
                self.end = False
            else:
                self.end = True
                tok = lex.LexToken()
                tok.type = self.eof
                tok.value = None
                tok.lexpos = self.lexer.lexpos
                tok.lineno = self.lexer.lineno
        # print('custom', tok)
        return tok

    def __getattr__(self, name):
        return getattr(self.lexer, name)

lexer = ProxyLexer(orig_lexer, 'EOF')

# Rules
def p_statement_expr(p):
    '''statement : expressions EOF'''
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
parser = yacc.yacc()

while 1:
    try:
        s = raw_input('query > ')  # use input() on Python 3
    except EOFError:
        break
    parser.parse(s, lexer=lexer)
That way:
- the original grammar is unchanged
- the error recovery method remains stupidly simple and has no dependence on the rest of the grammar
- it can be easily extended to complex parsers
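As a quick sanity check of the proxy in isolation (assuming the lexer and ProxyLexer definitions above), you can feed it a string directly and watch the synthetic EOF token appear once after the real tokens:

# tokenize a string through the proxy: A, END, A, then the synthetic EOF token
lexer.input('A END A')
while True:
    tok = lexer.token()
    print(tok)
    if tok.type == 'EOF':
        break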
As you want to accept all elements, you can explicitly declare a rule for an A not followed by an END, and use the fact that yacc and PLY deal with ambiguous rules gracefully.
You can simply have a normal rule:
Expression : A END
and below it a lower-priority rule (as it comes later) that will issue a warning:
Expression : A
That way, all A tokens will be accepted, there won't be any syntax error, and the warning will be issued for any A not followed by an END, including one at the end of the input. To make it easier to find the offending A, I have added the position of the symbol to the warning.
Edit:
The script is modified to correctly deal with other syntax errors (such as AENDENDAEND), and also to immediately reduce expressions by replacing expressions : expression expressions with expressions : expressions expression.
Here is the modified script (tested in Python 3.4, simply replacing raw_input with input):
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]

def p_expressions_err(p):
    '''expressions : expressions error'''
    p[0] = p[1]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END'''
    p[0] = 'A'

# add a separate rule BELOW previous one to display a warning
def p_expression_pharse_warn(p):
    '''expression : A'''
    print("Warning at absolute position %d (line %d)" % (p.lexpos(1), p.lineno(1)))
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = raw_input('query > ')  # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)
Edit: the following is an incorrect attempt to avoid an additional rule; it is more complex and less efficient than the above version. Please see my conclusion below.
Edit per comment:
I understand your point that you do not want to multiply grammar rules. It is possible to be fault tolerant, except for the last token: if your last token is in error, it will not be followed by anything and will never be caught by the rule expression : A error.
But here is a fault tolerant parser that keeps everything except the last token in case of an error on that one:
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    # print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expressions expression'''
    p[0] = p[1] + [p[2]]
    result.append(p[2])

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END
                  | A error'''
    p[0] = 'A'

def p_error(p):
    if p:
        print("Syntax error at '%s' (%d)" % (p.value, p.lexpos))
    else:
        print("Syntax error at EOI")

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = input('query > ')  # use input() on Python 3
    except EOFError:
        break
    result = []
    yacc.parse(s)
    print('Result', result)
The principle is to accumulate with expressions : expressions expression instead of expressions : expression expressions, and to keep everything in a global variable.
With an input of A END A A END A A A END it gives
Result ['A', 'A', 'A', 'A', 'A', 'A']
and with A END A A END A A A (final END omitted), it gives
Result ['A', 'A', 'A', 'A', 'A']
(all tokens but the last)
With a true flex-bison solution, it would be possible to make use of the special <<EOF>> token that matches at end of input, so that there is always another token after the last one. Unfortunately, it is not implemented in PLY, and the only real solution is to introduce a rule that accepts a lone A token. For a real parser, it also guarantees that you are actually processing the correct token: I used
def p_expression_pharse(p):
    '''expression : A END'''
    p[0] = 1 + p.lexpos(1)

# add a separate rule BELOW previous one to display a warning
def p_expression_pharse_warn(p):
    '''expression : A'''
    print("Warning at absolute position %d (line %d)" % (p.lexpos(1), p.lineno(1)))
    p[0] = -1 - p.lexpos(1)
to uniquely identify tokens in the result, and I get correct positions.
And... the error processing is very simple...
Discussion TL/DR:
I admit I missed the point of last-token error recovery. That is because in all parsers I have seen in real use cases, error recovery consisted of rejecting the part that was syntactically incorrect (and thus not directly usable) and re-synchronizing the parser on the next correct group of tokens. In everything I have seen, if a partial sentence can be used, it must not be processed by the error recovery mechanism but by a grammar rule, in which it is easy to describe the appropriate action.
If you just want to keep the offending input for later processing, I think it is not a problem of an action depending on the syntax, and I would simply note the position of the offending token, or at most note the position of the last correctly analysed token (the end of a complete element) and the beginning of the first error recovery token, and say that what is between is incorrect.
But that would be much different from what is asked here...
This works for all examples I could imagine
from __future__ import print_function

# Tokens
tokens = ('A', 'END')
t_A = r'A'
t_END = r'END'
t_ignore = " "

def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)

# Build the lexer
import ply.lex as lex
lex.lex()

# Rules
def p_statement_expr(p):
    '''statement : expressions'''
    print("parsed:", p[1])

def p_expressions(p):
    '''expressions : expression expressions'''
    p[0] = p[1] + p[2]

def p_expressions_empty(p):
    '''expressions : '''
    p[0] = list()

def p_expression_pharse(p):
    '''expression : A END'''
    p[0] = ['A']

def p_expression_error(p):
    '''expression : A error'''
    p[0] = ['A']
    if p[2] is not None:
        p[0] += p[2]

def p_error(p):
    if p is None:
        print("Syntax error at EOI")
        e = yacc.YaccSymbol()
        e.type = 'error'
        e.value = None
        yacc.errok()
        return e
    elif p.type == 'error':
        yacc.errok()
        return
    elif hasattr(p, 'value'):
        print("Syntax error at '%s'" % p.value)
        e = yacc.YaccSymbol()
        e.type = 'error'
        e.value = p.value
        yacc.errok()
        return e

import ply.yacc as yacc
yacc.yacc()

while 1:
    try:
        s = raw_input('query > ')  # use input() on Python 3
    except EOFError:
        break
    yacc.parse(s)

Return multiple input (Python)

In Python 3 I have a line asking for input; the code then looks each entry up in an imported dictionary and lists all the inputs that appear in the dictionary. My problem is that when I run the code and put in the input, it only returns the last word I input.
For example
the dictionary contains (AIR, AMA)
and if I input (AIR, AMA) it will only return AMA.
Any information to resolve this would be very helpful!
The dictionary:
EXCHANGE_DATA = [('AIA', 'Auckair', 1.50),
                 ('AIR', 'Airnz', 5.60),
                 ('AMP', 'Amp', 3.22),
The Code:
import shares

a = input("Please input")
s1 = a.replace(' ', "")
print('Please list portfolio: ' + a)
print(" ")

n = ["Code", "Name", "Price"]
print('{0: <6}'.format(n[0]) + '{0:<20}'.format(n[1]) + '{0:>8}'.format(n[2]))

z = shares.EXCHANGE_DATA[0:][0]
b = s1.upper()
c = b.split()
f = shares.EXCHANGE_DATA

def find(f, a):
    return [s for s in f if a.upper() in s]

x = (find(f, str(a)))

toDisplay = []
a = a.split()
for i in a:
    temp = find(f, i)
    if temp:
        toDisplay.append(temp)

for i in toDisplay:
    print('{0: <6}'.format(i[0][0]) + '{0:<20}'.format(i[0][1]) + ("{0:>8.2f}".format(i[0][2])))
Ok, the code seems somewhat confused. Here's a simpler version that seems to do what you want:
#!/usr/bin/env python3
EXCHANGE_DATA = [('AIA', 'Auckair', 1.50),
                 ('AIR', 'Airnz', 5.60),
                 ('AMP', 'Amp', 3.22)]

user_input = input("Please Specify Shares: ")
names = set(user_input.upper().split())

print('Listing the following shares: ' + str(names))
print(" ")

# Print header
n = ["Code", "Name", "Price"]
print('{0: <6}{1:<20}{2:>8}'.format(n[0], n[1], n[2]))

# print data
for i in [data for data in EXCHANGE_DATA if data[0] in names]:
    print('{0: <6}{1:<20}{2:>8}'.format(i[0], i[1], i[2]))
And here's an example of use:
➤ python3 program.py
Please Specify Shares: air amp
Listing the following shares: {'AMP', 'AIR'}
Code  Name                   Price
AIR   Airnz                    5.6
AMP   Amp                     3.22
The code sample you provided actually does what was expected, if you give it space-separated share codes.
Hope this helps.
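As a small design note, indexing the share data by code in a dict makes each lookup direct instead of scanning the whole list; this is just an alternative sketch reusing the EXCHANGE_DATA and names variables from the answer above:

# index the share data by code for direct lookups
quotes = {code: (name, price) for code, name, price in EXCHANGE_DATA}
for code in sorted(names):
    if code in quotes:
        name, price = quotes[code]
        print('{0: <6}{1:<20}{2:>8.2f}'.format(code, name, price))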