The Computer Language
Benchmarks Game

regex-redux Java #4 program

source code

/*
   The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/

   regex-dna program contributed by Razii, 
      idea taken from Elliott Hughes and Roger Millington
   converted from regex-dna program
*/

import java.io.*;
import java.util.regex.*;
import java.util.*;

public final class regexredux {

   private static final Map<String, String> replacements = new HashMap<String, String>();

   static {

      replacements.put("tHa[Nt]", "<4>");
      replacements.put("aND|caN|Ha[DS]|WaS", "<3>");
      replacements.put("a[NSt]|BY", "<2>");
      replacements.put("<[^>]*>", "|");
      replacements.put("\\|[^|][^|]*\\|", "-");
   }

   static abstract class Rewriter {
      private Pattern pattern;
      private Matcher matcher;

      public Rewriter(String regularExpression) {

         this.pattern = Pattern.compile(regularExpression);
      }

      public String group(int i) {
         return matcher.group(i);
      }

      public abstract String replacement();

      public String rewrite(CharSequence original) {
         return rewrite(original, new StringBuffer(original.length())).toString();
      }

      public StringBuffer rewrite(CharSequence original, StringBuffer destination) {
         this.matcher = pattern.matcher(original);
         while (matcher.find()) {
            matcher.appendReplacement(destination, "");
            destination.append(replacement());
         }
         matcher.appendTail(destination);
         return destination;
      }
   }

   public static void main(String[] args)
   throws IOException {

      Reader r = new InputStreamReader(System.in, "ISO-8859-1");
      StringBuilder sb = new StringBuilder(5100000);
      char[] cbuf = new char[16384];
      int charsRead;
      while ((charsRead = r.read(cbuf)) != -1)
         sb.append(cbuf, 0, charsRead);

      int initialLength = sb.length();

      String sequence = new Rewriter(">.*\n|\n") {

         public String replacement() {
            return "";
         }
      }.rewrite(sb);


      int codeLength = sequence.length();

      String[] variants = { "agggtaaa|tttaccct" ,
                       "[cgt]gggtaaa|tttaccc[acg]",
                       "a[act]ggtaaa|tttacc[agt]t",
                       "ag[act]gtaaa|tttac[agt]ct",
                       "agg[act]taaa|ttta[agt]cct",
                       "aggg[acg]aaa|ttt[cgt]ccct",
                       "agggt[cgt]aa|tt[acg]accct",
                       "agggta[cgt]a|t[acg]taccct",
                       "agggtaa[cgt]|[acg]ttaccct"
                     };

      for (String variant : variants) {

         int count = 0;
         Matcher m = Pattern.compile(variant).matcher(sequence);
         while (m.find())
            count++;
         System.out.println(variant + " " + count);
      }

      sequence = new Rewriter("[WYKMSRBDVHN]") {

         public String replacement() {
            return replacements.get(group(0));
         }
      }.rewrite(sequence);

      System.out.println();
      System.out.println(initialLength);
      System.out.println(codeLength);
      System.out.println(sequence.length());

   }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
java 10 2018-03-20
Java(TM) SE Runtime Environment 18.3 (build 10+46)
Java HotSpot(TM) 64-Bit Server VM 18.3 (build 10+46, mixed mode)


Wed, 21 Mar 2018 18:32:49 GMT

MAKE:
mv regexredux.java-4.java regexredux.java
/opt/src/jdk-10/bin/javac -d .  regexredux.java

1.15s to complete and log all make actions

COMMAND LINE:
/opt/src/jdk-10/bin/java   regexredux 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

13c13
< 599189
---
> 273927

PROGRAM OUTPUT:
agggtaaa|tttaccct 3
[cgt]gggtaaa|tttaccc[acg] 12
a[act]ggtaaa|tttacc[agt]t 43
ag[act]gtaaa|tttac[agt]ct 27
agg[act]taaa|ttta[agt]cct 58
aggg[acg]aaa|ttt[cgt]ccct 16
agggt[cgt]aa|tt[acg]accct 15
agggta[cgt]a|t[acg]taccct 18
agggtaa[cgt]|[acg]ttaccct 20

508411
500000
599189