I need to parse out time tokens from a string where the tokens are optional. Samples given:
- tt-5d10h
- tt-5d10h30m
- tt-5d30m
- tt-10h30m
- tt-5d
- tt-10h
- tt-30m
How can I, in Python, parse this out preferably as the set (days, hours, minutes)?
I need to parse out time tokens from a string where the tokens are optional. Samples given:
How can I, in Python, parse this out preferably as the set (days, hours, minutes)?
This program returns three integers (days, hours, minutes) for each input:
import re
# Sample inputs from the question; each of the d/h/m parts may be absent.
samples = ['tt-5d10h', 'tt-5d10h30m', 'tt-5d30m', 'tt-10h30m', 'tt-5d', 'tt-10h', 'tt-30m',]
def parse(text):
    """Parse a ``tt-`` time token into ``[days, hours, minutes]``.

    Each of the ``<n>d``, ``<n>h`` and ``<n>m`` parts is optional, but those
    present must appear in that order directly after the ``tt-`` prefix.
    Parts that are absent are reported as 0.

    text: the token to parse, e.g. ``'tt-5d30m'``.

    Returns a list of three ints.
    Raises ValueError if *text* does not start with ``tt-``.
    """
    # Raw string so that \d reaches the regex engine as written instead of
    # relying on Python passing an unknown string escape through.
    match = re.match(r'tt-(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)m)?', text)
    if match is None:
        raise ValueError('not a tt- time token: %r' % (text,))
    # groups(0) substitutes 0 for every optional group that did not match.
    return [int(x) for x in match.groups(0)]
# Print the parsed [days, hours, minutes] list for every sample.
# A single-argument print(...) call behaves the same on Python 2 and 3.
for sample in samples:
    print(parse(sample))
Output:
[5, 10, 0]
[5, 10, 30]
[5, 0, 30]
[0, 10, 30]
[5, 0, 0]
[0, 10, 0]
[0, 0, 30]
>>> pattern = re.compile("tt-(\d+d)?(\d+h)?(\d+m)?")
>>> results = pattern.match("tt-5d10h")
>>> days, hours, minutes = results.groups()
>>> days, hours, minutes
('5d', '10h', None)
Similar to compie's answer, but making the end result nicer to deal with:
re.match('tt-(?:(?P<days>\d+)d)?(?:(?P<hours>\d+)h)?(?:(?P<minutes>\d+)m)?', text).groupdict()
Example:
>>> import re
>>> s = ['tt-5d10h', 'tt-5d10h30m', 'tt-5d30m', 'tt-10h30m', 'tt-5d', 'tt-10h', 'tt-30m']
>>> for text in s:
...     print(re.match('tt-(?:(?P<days>\d+)d)?(?:(?P<hours>\d+)h)?(?:(?P<minutes>\d+)m)?', text).groupdict())
...
{'hours': '10', 'minutes': None, 'days': '5'}
{'hours': '10', 'minutes': '30', 'days': '5'}
{'hours': None, 'minutes': '30', 'days': '5'}
{'hours': '10', 'minutes': '30', 'days': None}
{'hours': None, 'minutes': None, 'days': '5'}
{'hours': '10', 'minutes': None, 'days': None}
{'hours': None, 'minutes': '30', 'days': None}
If you want to substitute 0 for the left-out tokens instead, just use groupdict(0) instead of groupdict().
By partition:
inputstring="""tt-5d10h
tt-5d10h30m
tt-5d30m
tt-10h30m
tt-5d
tt-10h
tt-30m
"""
# Unit suffixes in the order they may appear in a token.
separators = ('d', 'h', 'm')
lines = inputstring.splitlines()

# result[i] is the [days, hours, minutes] list for lines[i].
result = []
for line in lines:
    # lstrip('t-') removes leading 't' and '-' characters, i.e. the 'tt-' tag.
    text = line.lstrip('t-')
    data = []
    for sep in separators:
        number, found, rest = text.partition(sep)
        if found:
            data.append(int(number))
            # Only consume the text when this unit was actually present.
            text = rest
        else:
            data.append(0)
    result.append(data)

# show input and result
for respairs in zip(lines, result):
    print(respairs)
""" Output:
('tt-5d10h', [5, 10, 0])
('tt-5d10h30m', [5, 10, 30])
('tt-5d30m', [5, 0, 30])
('tt-10h30m', [0, 10, 30])
('tt-5d', [5, 0, 0])
('tt-10h', [0, 10, 0])
('tt-30m', [0, 0, 30])
"""
Here's a pyparsing approach to your problem:
# Sample tokens from the question, one per line, split into a list of strings.
tests = """tt-5d10h
tt-5d10h30m
tt-5d30m
tt-10h30m
tt-5d
tt-10h
tt-30m""".splitlines()
from pyparsing import Word,nums,Optional
# A run of digits, converted to int by the parse action.
integer = Word(nums).setParseAction(lambda t:int(t[0]))
# "tt-" followed by optional day, hour and minute parts, in that order.
# The numbers are stored under the result names "days", "hrs" and "mins".
timeFormat = "tt-" + (
Optional(integer("days") + "d") +
Optional(integer("hrs") + "h") +
Optional(integer("mins") + "m")
)
def normalizeTime(tokens):
    """Return ``(days, hrs, mins)`` from *tokens*, using 0 for absent fields.

    tokens: the named parse results handed to a parse action; any object
        supporting ``in`` and ``[]`` by field name works.

    Returns a 3-tuple in the fixed order days, hrs, mins.
    """
    return tuple(tokens[field] if field in tokens else 0
                 for field in "days hrs mins".split())
# Run normalizeTime on every successful match of timeFormat.
timeFormat.setParseAction(normalizeTime)
for test in tests:
    # Element [0] of the parse result is the tuple returned by normalizeTime.
    # One print(...) call with a single argument runs on Python 2 and 3 and
    # produces the same line as the former two print statements.
    print("%-12s -> %d %02d:%02d" % ((test,) + timeFormat.parseString(test)[0]))
Prints:
tt-5d10h -> 5 10:00
tt-5d10h30m -> 5 10:30
tt-5d30m -> 5 00:30
tt-10h30m -> 0 10:30
tt-5d -> 5 00:00
tt-10h -> 0 10:00
tt-30m -> 0 00:30
Or to preserve the named results:
def normalizeTime(tokens):
    """Set any of days/hrs/mins missing from *tokens* to 0, in place.

    tokens: the named parse results handed to a parse action; any object
        supporting ``in`` and item assignment by field name works.

    Returns nothing; fields already present are left untouched.
    """
    for field in "days hrs mins".split():
        if field not in tokens:
            tokens[field] = 0
# Run normalizeTime on every successful match of timeFormat.
timeFormat.setParseAction(normalizeTime)
for test in tests:
    # The parse result is used as the mapping for the %(name)-style format.
    formatted = "%(days)d %(hrs)02d:%(mins)02d" % timeFormat.parseString(test)
    # One print(...) call with a single argument runs on Python 2 and 3 and
    # produces the same line as the former two print statements.
    print("%-12s -> %s" % (test, formatted))