The Computer Language
Benchmarks Game

regex-redux Lua #2 program

source code

-- The Computer Language Benchmarks Game
-- http://benchmarksgame.alioth.debian.org/
-- regex-dna program contributed by Jim Roseborough
-- modified by Victor Tang
-- optimized & replaced inefficient use of gsub with gmatch
-- partitioned sequence to prevent extraneous redundant string copy
-- converted from regex-dna program
-- fixed by saito tanaka 

seq = io.read("*a")
ilen, seq = #seq, seq:gsub('>[^%c]*%c*', ''):gsub('%c+', '')
clen = #seq

local variants = { 'agggtaaa|tttaccct',
                   '[cgt]gggtaaa|tttaccc[acg]',
                   'a[act]ggtaaa|tttacc[agt]t',
                   'ag[act]gtaaa|tttac[agt]ct',
                   'agg[act]taaa|ttta[agt]cct',
                   'aggg[acg]aaa|ttt[cgt]ccct',
                   'agggt[cgt]aa|tt[acg]accct',
                   'agggta[cgt]a|t[acg]taccct',
                   'agggtaa[cgt]|[acg]ttaccct', }

            	-- illegal key names should by between [] like a={['@&!']=4}

local subst = { ['tHa[Nt]']='<4>', ['aND|caN|Ha[DS]|WaS']='<3>', ['a[NSt]|BY']='<2>', 
                ['<[^>]*>']='|', ['\\|[^|][^|]*\\|']='-' ,}

function countmatches(variant)
   local n = 0
   variant:gsub('([^|]+)|?', function(pattern)
      for _ in seq:gmatch(pattern) do n = n + 1 end
   end)
   return n
end

for _, p in ipairs(variants) do
   io.write( string.format('%s %d\n', p, countmatches(p)) )
end

function partitionstring(seq)
  local seg = math.floor( math.sqrt(#seq) )
  local seqtable = {}
  for nextstart = 1, #seq, seg do
    table.insert(seqtable, seq:sub(nextstart, nextstart + seg - 1))
  end
  return seqtable
end
function chunk_gsub(t, k, v)
  for i, p in ipairs(t) do
    t[i] = p:find(k) and p:gsub(k, v) or t[i]
  end
  return t
end

seq = partitionstring(seq)
for k, v in pairs(subst) do
  chunk_gsub(seq, k, v)
end
seq = table.concat(seq)
io.write(string.format('\n%d\n%d\n%d\n', ilen, clen, #seq))

    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Lua 5.3.4  Copyright (C) 1994-2017 Lua.org, PUC-Rio


Wed, 15 Nov 2017 02:53:52 GMT

COMMAND LINE:
/opt/src/lua-5.3.4/bin/lua  regexredux.lua-2.lua 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

13c13
< 499805
---
> 273927

PROGRAM OUTPUT:
agggtaaa|tttaccct 3
[cgt]gggtaaa|tttaccc[acg] 12
a[act]ggtaaa|tttacc[agt]t 43
ag[act]gtaaa|tttac[agt]ct 27
agg[act]taaa|ttta[agt]cct 58
aggg[acg]aaa|ttt[cgt]ccct 16
agggt[cgt]aa|tt[acg]accct 15
agggta[cgt]a|t[acg]taccct 18
agggtaa[cgt]|[acg]ttaccct 20

508411
500000
499805