The Computer Language
Benchmarks Game

k-nucleotide C# .NET Core #5 program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/
 *
 * contributed by Sergey Prytkov
 * shifting arithmetic borrowed from Anthony Lloyd's submission
 * 
 */

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading;

namespace knucleotide
{
    class Program
    {
        private static readonly (string, Dictionary<long, Wrapper>, string)[] dics = {
            ("0", new Dictionary<long, Wrapper>(), (string)null),
            ("00", new Dictionary<long, Wrapper>(), (string)null),
            ("GGT", new Dictionary<long, Wrapper>(), (string)null),
            ("GGTA", new Dictionary<long, Wrapper>(), (string)null),
            ("GGTATT", new Dictionary<long, Wrapper>(), (string)null),
            ("GGTATTTTAATT", new Dictionary<long, Wrapper>(), (string)null),
            ("GGTATTTTAATTTATAGT", new Dictionary<long, Wrapper>(), (string)null),
        };
        private const byte a = (byte) 'a';
        private const byte A = (byte) 'A';
        private const byte c = (byte) 'c';
        private const byte C = (byte) 'C';
        private const byte g = (byte) 'g';
        private const byte G = (byte) 'G';
        private const byte T = (byte) 'T';
        private const byte t = (byte) 't';
        private const byte newLine = (byte) '\n';

        private static readonly List<byte[]> memory = new List<byte[]>();
        private static int start = 0;
        private static int end = -1;
        private static readonly VtComparer vt = new VtComparer();

        private static int work=-1;
        private const int WORK_COUNT=6;

        private static void Worker()
        {
            while (true)
            {
                var currentWork = Interlocked.Increment(ref work);
                if (currentWork > WORK_COUNT) return;
                var (f, d, _) = dics[currentWork];
                var fLen = f.Length;
                ref var ss = ref dics[currentWork].Item3;
                CountFrequency(fLen, d);
                if (fLen <= 2)
                    ss = WriteFrequencies(d, fLen);
                else
                    ss = WriteCount(d, f);
                if (currentWork == WORK_COUNT) return;
            }
        }

        private static void RunWorkers()
        {
            var worker = new ThreadStart(Worker);
            var workers = new Thread[Environment.ProcessorCount];
            for (int i = 0; i < workers.Length; i++)
            {
                workers[i] = new Thread(worker);
                workers[i].Start();
            }
            for (int i = 0; i < workers.Length; i++)
            {
                workers[i].Join();
            }
        }

        static void Main(string[] args)
        {
            ReadInput();
            RunWorkers();
            Console.WriteLine(dics[0].Item3);
            Console.WriteLine(dics[1].Item3);
            Console.WriteLine(dics[2].Item3);
            Console.WriteLine(dics[3].Item3);
            Console.WriteLine(dics[4].Item3);
            Console.WriteLine(dics[5].Item3);
            Console.WriteLine(dics[6].Item3);
        }

        private class VtComparer : IComparer<(int, long)>
        {
            public int Compare((int, long) x, (int, long) y)
            {
                var a = y.Item1 - x.Item1;
                if (a == 0) return (int) (x.Item2 - y.Item2);
                return a;
            }
        }

        private static string WriteFrequencies(Dictionary<long, Wrapper> freq, int fragmentLength)
        {
            var sb = new StringBuilder();
            var res = new (int, long)[freq.Count];
            var total = 0.0d;
            var i = 0;
            foreach (var kv in freq)
            {
                total += kv.Value.V;
                res[i++] = (kv.Value.V, kv.Key);
            }
            double percent = 100.0 / total;
            Array.Sort(res, vt);

            foreach (var (val, k) in res)
            {
                var key = k;
                if (fragmentLength == 2)
                {
                    var b = ToChar((int) (key & 0x3));
                    key >>= 2;
                    var a = ToChar((int) (key & 0x3));
                    sb.Append(a);
                    sb.Append(b);
                }
                else
                {
                    sb.Append(ToChar((int) (key & 0x3)));
                }
                sb.Append(' ');
                sb.AppendLine((val * percent).ToString("F3"));
            }
            return sb.ToString();
        }

        private static string WriteCount(Dictionary<long, Wrapper> dictionary, string fragment)
        {
            long key = 0;
            for (int i = 0; i < fragment.Length; ++i)
                key = (key << 2) | ToNum((byte) fragment[i]);
            Wrapper w;
            var n = dictionary.TryGetValue(key, out w) ? w.V : 0;
            return String.Concat(n.ToString(), "\t", fragment);
        }

        private static void CountFrequency(int fragmentLength, Dictionary<long, Wrapper> dictionary)
        {
            long rollingKey = 0;
            long mask = 0;
            int cursor;
            var memoryI = 0;
            var localStart = start;
            for (cursor = 0; cursor < fragmentLength - 1; cursor++)
            {
                rollingKey <<= 2;
                if (memory[memoryI].Length <= localStart + cursor)
                {
                    memoryI++;
                    localStart = 0;
                    cursor = 0;
                }
                rollingKey |= ToNum(memory[memoryI][localStart + cursor]);
                mask = (mask << 2) + 3;
            }
            localStart += cursor;
            mask = (mask << 2) + 3;
            Wrapper w;
            byte cursorByte;
            for (int i = memoryI; i < memory.Count; i++)
            {
                var buffer = memory[i];
                for (int j = i == 0 ? localStart : 0; j < buffer.Length; j++)
                {
                    if (i == memory.Count - 1 && j == end - 1) break;
                    if ((cursorByte = buffer[j]) == newLine) continue;
                    rollingKey = ((rollingKey << 2) & mask) | ToNum(cursorByte);
                    if (dictionary.TryGetValue(rollingKey, out w))
                        w.V++;
                    else
                        dictionary.Add(rollingKey, new Wrapper(1));
                }
            }
        }

        private class Wrapper
        {
            public int V;

            public Wrapper(int v)
            {
                V = v;
            }
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static byte ToNum(byte b)
        {
            Debug.Assert(new []{'A','C','G','T','a','c','g','t'}.Contains((char) b));
            return (byte)((b & 6) >> 1);
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static char ToChar(int i)
        {
            switch (i)
            {
                case 0:
                    return 'A';
                case 1:
                    return 'C';
                case 2:
                    return 'T';
                default:
                    return 'G';
            }
        }

        private static void ReadInput()
        {
            var bufferSize = 32 * 1024 * 8;
            var buffer = new byte[bufferSize];
            var stdin = Console.OpenStandardInput();
            var gti = -1;
            var gt = (byte) '>';
            var T = (byte) 'T';
            var H = (byte) 'H';
            var R = (byte) 'R';
            var E = (byte) 'E';
            int readen;
            bool headerFound = false;
            byte[] helpMe = null;
            // I need refactor this mess
            while ((readen = stdin.Read(buffer, 0, bufferSize)) > 0)
            {
                // looking for '>'
                while (!headerFound && (gti = Array.IndexOf(buffer, gt, gti + 1, readen - (gti + 1))) != -1)
                {
                    // if '>THREE' fits buffer with no overlap (very likely)
                    if (gti < readen - 6)
                    {
                        if (buffer[gti + 1] == T &&
                            buffer[gti + 2] == H &&
                            buffer[gti + 3] == R &&
                            buffer[gti + 4] == E &&
                            buffer[gti + 5] == E)
                        {
                            headerFound = true;
                            break;
                        }
                    }
                    else
                    {
                        helpMe = new byte[5];
                        Buffer.BlockCopy(buffer, gti, helpMe, 0, readen - gti);
                        stdin.Read(helpMe, readen - gti, helpMe.Length - (readen - gti));
                        if (helpMe[0] == T &&
                            helpMe[1] == H &&
                            helpMe[2] == R &&
                            helpMe[3] == E &&
                            helpMe[4] == E)
                        {
                            headerFound = true;
                            break;
                        }
                    }
                }
                if (headerFound)
                {
                    // if new line after header also in this buff (very likely)
                    if ((start = Array.IndexOf(buffer, newLine, gti + 6, readen - (gti + 6))) != -1)
                    {
                        memory.Add(buffer);
                        end = Array.IndexOf(buffer, gt, start, readen - start);
                        if (readen < bufferSize)
                        {
                            end = readen;
                            goto exit;
                        }
                        else
                        {
                            buffer = new byte[bufferSize];
                            goto second_loop;
                        }
                    }
                }
            }
            second_loop:
            while (end == -1 && ((readen = stdin.Read(buffer, 0, bufferSize)) > 0))
            {
                end = Array.IndexOf(buffer, gt, 0, readen);
                memory.Add(buffer);
                buffer = new byte[bufferSize];
                if (readen != bufferSize) break;
            }
            if (end == -1) end = readen;
            exit:
            start++;
            stdin.Dispose();
            GC.Collect();
        }
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
2.0.2 a04b4bf512
"System.GC.Server": true


Fri, 27 Oct 2017 00:08:10 GMT

MAKE:
cp knucleotide.csharpcore-5.csharpcore Program.cs
cp Include/csharpcore/tmp.csproj .
cp Include/csharpcore/runtimeconfig.template.json .
mkdir obj
cp Include/csharpcore/tmp.csproj.nuget.g.props ./obj
cp Include/csharpcore/tmp.csproj.nuget.g.targets ./obj
/usr/bin/dotnet build -c Release
Microsoft (R) Build Engine version 15.4.8.50001 for .NET Core
Copyright (C) Microsoft Corporation. All rights reserved.

  tmp -> /home/dunham/benchmarksgame_quadcore/knucleotide/tmp/bin/Release/netcoreapp2.0/tmp.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:03.83

6.41s to complete and log all make actions

COMMAND LINE:
/usr/bin/dotnet ./bin/Release/netcoreapp2.0/tmp.dll 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT