DNA processing

DNA processing

Problem 5-5.

Generate the k-mer Composition of a String

Given a string Text, its k-mer composition Compositionk(Text) is the collection of all k-mer substrings of Text (including repeated k-mers). For example,

Composition3(TATGGGGTGC) = {ATG, GGG, GGG, GGT, GTG, TAT, TGC, TGG}

Note that we have listed k-mers in lexicographic order (i.e., how they would appear in a dictionary) rather than in the order of their appearance in TATGGGGTGC. We have done this because the correct ordering of the reads is unknown when they are generated.

String Composition Problem

Generate the k-mer composition of a string.

Given: An integer k and a string Text.

Return: Compositionk(Text) (the k-mers can be provided in any order).

Sample Dataset

5

CAATCCAAC

Sample Output

AATCC

ATCCA

CAATC

CCAAC

TCCAA 

Problem 5-6.

Reconstruct a String from its Genome

Find the string spelled by a genome path.

Given: A sequence of k-mers Pattern1, … , Patternn such that the last k – 1 symbols of Patterni are equal to the first k – 1 symbols of Patterni+1 for i from 1 to n-1.

Return: A string Text of length k+n-1 where the i-th k-mer in Text is equal to Patterni for all i.

Sample Dataset

ACCGA

CCGAA

CGAAG

GAAGC

AAGCT

Sample Output

ACCGAAGCT 

Problem 5-7.

Reconstruct a String from its k-mer Composition

Given: An integer k followed by a list of k-mers Patterns.

Return: A string Text with k-mer composition equal to Patterns. (If multiple answers exist, you may return any one.)

Sample Dataset

4

CTTA

ACCA

TACC

GGCT

GCTT

TTAC

Sample Output

GGCTTACCA 

Solution 

Program5_5.java

import java.util.List;

import java.util.ArrayList;

import java.util.Collections;

public class Program5_5 {

/**

* @return the k-mers in any order.

*/

public static List<String> composition(final int k, final String text) {

final List<String> kmers = new ArrayList<>();

for (int i = 0; i + k <= text.length(); i++) {

final String kmer = text.substring(i, i + k);

kmers.add(kmer);

}

Collections.sort(kmers);

return kmers;

}

// test program

public static void main(String[] args) {

final List<String> kmers = composition(5, “CAATCCAAC”);

for (final String kmer : kmers) {

System.out.println(kmer);

}

}

} 

Program5_6.java 

import java.util.List;

import java.util.Arrays;

public class Program5_6 {

/**

* @return the string spelled by the genome path.

*/

public static String construct(final List<String> path) {

final StringBuilder result = new StringBuilder();

if (!path.isEmpty()) {

result.append(path.get(0));

for (int i = 1; i < path.size(); i++) {

final String node = path.get(i);

result.append(node.charAt(node.length() – 1));

}

}

return result.toString();

}

// test program

public static void main(String[] args) {

final List<String> path = Arrays.asList(“ACCGA”, “CCGAA”, “CGAAG”, “GAAGC”, “AAGCT”);

System.out.println(construct(path));

}

} 

Program5_7.java

import java.util.List;

import java.util.Arrays;

import java.util.Set;

import java.util.HashSet;

public class Program5_7 {

/**

* Reconstruct a string from its k-mer composition.

*/

public static String construct(final int k, final List<String> kmers) {

if (!kmers.isEmpty()) {

for (int i = 0; i < kmers.size(); i++) {

// starting from each kmer as the first part

// check if we can traverse all kmers as a path.

final StringBuilder result = new StringBuilder(kmers.get(i));

// construct the index list for unvisited kmers

final Set<Integer> unvisited = new HashSet<>();

for  (int j = 0; j < kmers.size(); j++) {

if (j != i) {

unvisited.add(j);

}

}

if (traverse(k, kmers, unvisited, result)) {

return result.toString();

}

}

}

return “”;

}

// a recursive function to check if we can reach all kmers.

private static boolean traverse(final int k, final List<String> kmers, final Set<Integer> unvisited, final StringBuilder builder) {

if (unvisited.isEmpty()) {

return true;

}

final String prefix = builder.substring(builder.length() – k + 1);

final Set<Integer> remaining = new HashSet<>();

for (final int next : unvisited) {

final String kmer = kmers.get(next);

if (kmer.startsWith(prefix)) {

// try to use this one as the next one on the path

builder.append(kmer.charAt(k – 1));

remaining.addAll(unvisited);

remaining.remove(next);

// continue trying with remaining kmers

if (traverse(k, kmers, remaining, builder)) {

return true;

}

// recover the temporary path for next trial

remaining.clear();

builder.delete(builder.length() – 1, builder.length());

}

}

return false;

}

// test program

public static void main(String[] args) {

final List<String> kmers = Arrays.asList(

“CTTA”, “ACCA”, “TACC”, “GGCT”, “GCTT”, “TTAC”);

System.out.println(construct(4, kmers));

}

}