The Computer Language
Benchmarks Game

k-nucleotide C# .NET Core #8 program

source code

/* The Computer Language Benchmarks Game
   http://benchmarksgame.alioth.debian.org/
 
   submitted by Josh Goldfoot
   Modified to reduce memory and do more in parallel by Anthony Lloyd
 */

using System;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Runtime.CompilerServices;

class Wrapper { public int v=1; }
public static class KNucleotide
{
    const int BLOCK_SIZE = 1024 * 1024 * 8;
    static List<byte[]> threeBlocks = new List<byte[]>();
    static int threeStart, threeEnd;
    static byte[] tonum = new byte[256];
    static char[] tochar = new char[] {'A', 'C', 'G', 'T'};

    static int read(Stream stream, byte[] buffer, int offset, int count)
    {
        var bytesRead = stream.Read(buffer, offset, count);
        return bytesRead==count ? offset+count
             : bytesRead==0 ? offset
             : read(stream, buffer, offset+bytesRead, count-bytesRead);
    }
    
    static int find(byte[] buffer, byte[] toFind, int i, ref int matchIndex)
    {
        if(matchIndex==0)
        {
            i = Array.IndexOf(buffer, toFind[0], i);
            if(i==-1) return -1;
            matchIndex = 1;
            return find(buffer, toFind, i+1, ref matchIndex);
        }
        else
        {
            int bl = buffer.Length, fl = toFind.Length;
            while(i<bl && matchIndex<fl)
            {
                if(buffer[i++]!=toFind[matchIndex++])
                {
                    matchIndex = 0;
                    return find(buffer, toFind, i, ref matchIndex);
                }
            }
            return matchIndex==fl ? i : -1;
        }
    }

    static void loadThreeData()
    {
        var stream = Console.OpenStandardInput();
        
        // find three sequence
        int matchIndex = 0;
        var toFind = new [] {(byte)'>', (byte)'T', (byte)'H', (byte)'R', (byte)'E', (byte)'E'};
        var buffer = new byte[BLOCK_SIZE];
        do
        {
            threeEnd = read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = find(buffer, toFind, 0, ref matchIndex);
        } while (threeStart==-1);
        
        // Skip to end of line
        matchIndex = 0;
        toFind = new [] {(byte)'\n'};
        threeStart = find(buffer, toFind, threeStart, ref matchIndex);
        while(threeStart==-1)
        {
            threeEnd = read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = find(buffer, toFind, 0, ref matchIndex);
        }
        threeBlocks.Add(buffer);
        
        if(threeEnd!=BLOCK_SIZE) // Needs to be at least 2 blocks
        {
            var bytes = threeBlocks[0];
            for(int i=threeEnd; i<bytes.Length; i++)
                bytes[i] = 255;
            threeEnd = 0;
            threeBlocks.Add(Array.Empty<byte>());
            return;
        }

        // find next seq or end of input
        matchIndex = 0;
        toFind = new [] {(byte)'>'};
        threeEnd = find(buffer, toFind, threeStart, ref matchIndex);
        while(threeEnd==-1)
        {
            buffer = new byte[BLOCK_SIZE];
            var bytesRead = read(stream, buffer, 0, BLOCK_SIZE);
            threeEnd = bytesRead==BLOCK_SIZE ? find(buffer, toFind, 0, ref matchIndex)
                        : bytesRead;
            threeBlocks.Add(buffer);
        }

        if(threeStart+18>BLOCK_SIZE) // Key needs to be in the first block
        {
            byte[] block0 = threeBlocks[0], block1 = threeBlocks[1];
            Buffer.BlockCopy(block0, threeStart, block0, threeStart-18, BLOCK_SIZE-threeStart);
            Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE-18, 18);
            for(int i=0; i<18; i++) block1[i] = 255;
        }
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static void check(Dictionary<long, Wrapper> dict, ref long rollingKey, byte nb, long mask)
    {
        if(nb==255) return;
        rollingKey = ((rollingKey & mask) << 2) | nb;
        Wrapper w;
        if (dict.TryGetValue(rollingKey, out w))
            w.v++;
        else
            dict[rollingKey] = new Wrapper();
    }

    static Task<string> count(int l, long mask, Func<Dictionary<long,Wrapper>,string> summary)
    {
        return Task.Run(() =>
        {
            long rollingKey = 0;
            var firstBlock = threeBlocks[0];
            var start = threeStart;
            while(--l>0) rollingKey = (rollingKey<<2) | firstBlock[start++];
            var dict = new Dictionary<long,Wrapper>();
            for(int i=start; i<firstBlock.Length; i++)
                check(dict, ref rollingKey, firstBlock[i], mask);

            int lastBlockId = threeBlocks.Count-1; 
            for(int bl=1; bl<lastBlockId; bl++)
            {
                var bytes = threeBlocks[bl];
                for(int i=0; i<bytes.Length; i++)
                    check(dict, ref rollingKey, bytes[i], mask);
            }

            var lastBlock = threeBlocks[lastBlockId];
            for(int i=0; i<threeEnd; i++)
                check(dict, ref rollingKey, lastBlock[i], mask);
            return summary(dict);
        });
    }

    static string writeFrequencies(Dictionary<long,Wrapper> freq, int fragmentLength)
    {
        var sb = new StringBuilder();
        double percent = 100.0 / freq.Values.Sum(i => i.v);
        foreach(var kv in freq.OrderByDescending(i => i.Value.v))
        {
            var keyChars = new char[fragmentLength];
            var key = kv.Key;
            for (int i=keyChars.Length-1; i>=0; --i)
            {
                keyChars[i] = tochar[key & 0x3];
                key >>= 2;
            }
            sb.Append(keyChars);   
            sb.Append(" ");
            sb.AppendLine((kv.Value.v * percent).ToString("F3"));
        }
        return sb.ToString();
    }

    static string writeCount(Dictionary<long,Wrapper> dictionary, string fragment)
    {
        long key = 0;
        for (int i=0; i<fragment.Length; ++i)
            key = (key << 2) | tonum[fragment[i]];
        Wrapper w;
        var n = dictionary.TryGetValue(key, out w) ? w.v : 0;
        return string.Concat(n.ToString(), "\t", fragment);
    }

    public static void Main(string[] args)
    {
        tonum['c'] = 1; tonum['C'] = 1;
        tonum['g'] = 2; tonum['G'] = 2;
        tonum['t'] = 3; tonum['T'] = 3;
        tonum['\n'] = 255; tonum['>'] = 255; tonum[255] = 255;

        loadThreeData();

        Parallel.ForEach(threeBlocks, bytes =>
        {
            for(int i=0; i<bytes.Length; i++)
                bytes[i] = tonum[bytes[i]];
        });

        var task12 = count(12, 8388607, d => writeCount(d, "GGTATTTTAATT"));
        var task18 = count(18, 34359738367, d => writeCount(d, "GGTATTTTAATTTATAGT"));
        var task6 = count(6, 0b1111111111, d => writeCount(d, "GGTATT"));
        var task1 = count(1, 0, d => writeFrequencies(d, 1));
        var task2 = count(2, 0b11, d => writeFrequencies(d, 2));
        var task3 = count(3, 0b1111, d => writeCount(d, "GGT"));
        var task4 = count(4, 0b111111, d => writeCount(d, "GGTA"));
        
        Console.Out.WriteLineAsync(task1.Result);
        Console.Out.WriteLineAsync(task2.Result);
        Console.Out.WriteLineAsync(task3.Result);
        Console.Out.WriteLineAsync(task4.Result);
        Console.Out.WriteLineAsync(task6.Result);
        Console.Out.WriteLineAsync(task12.Result);
        Console.WriteLine(task18.Result);
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
2.0.2 a04b4bf512
"System.GC.Server": true


Fri, 27 Oct 2017 00:31:33 GMT

MAKE:
cp knucleotide.csharpcore-8.csharpcore Program.cs
cp Include/csharpcore/tmp.csproj .
cp Include/csharpcore/runtimeconfig.template.json .
mkdir obj
cp Include/csharpcore/tmp.csproj.nuget.g.props ./obj
cp Include/csharpcore/tmp.csproj.nuget.g.targets ./obj
/usr/bin/dotnet build -c Release
Microsoft (R) Build Engine version 15.4.8.50001 for .NET Core
Copyright (C) Microsoft Corporation. All rights reserved.

  tmp -> /home/dunham/benchmarksgame_quadcore/knucleotide/tmp/bin/Release/netcoreapp2.0/tmp.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:03.75

6.30s to complete and log all make actions

COMMAND LINE:
/usr/bin/dotnet ./bin/Release/netcoreapp2.0/tmp.dll 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT