How do I process files with a matching pattern in Nextflow?

Suppose I have the following Nextflow channels:
Channel.fromFilePairs( "test/read*_R{1,2}.fa" )
    .set{ reads }
reads.view()

Channel.fromPath( ['test/lib_R1.fa', 'test/lib_R2.fa'] )
    .set{ libs }
libs.view()
Which results in:
// reads channel
[read_b, [<path>/test/read_b_R1.fa, <path>/test/read_b_R2.fa]]
[read_a, [<path>/test/read_a_R1.fa, <path>/test/read_a_R2.fa]]
// libs channel
<path>/test/lib_R1.fa
<path>/test/lib_R2.fa
How do I run a process foo that executes each matching read/lib pair, where the same lib is used for all read pairs? So basically I want to execute foo 4 times:
foo(test/read_b_R1.fa, test/lib_R1.fa)
foo(test/read_b_R2.fa, test/lib_R2.fa)
foo(test/read_a_R1.fa, test/lib_R1.fa)
foo(test/read_a_R2.fa, test/lib_R2.fa)

If you want to use the same library for all read pairs, what you really want is a value channel, which can be read an unlimited number of times without being consumed. Note that a value channel is implicitly created by a process when it is invoked with a simple value (a short sketch of this behaviour follows the results below). That value could indeed be a list of files, but it looks like what you want is just one library file corresponding to each of the R1 or R2 reads. I think the simplest solution here is to include your process using an alias, so that you can pass in the required channels/files without too much effort:
params.reads = 'test/read*_R{1,2}.fa'

include { foo as foo_r1 } from './modules/foo.nf'
include { foo as foo_r2 } from './modules/foo.nf'

workflow {

    Channel
        .fromFilePairs( params.reads )
        .multiMap { sample, reads ->
            def (r1, r2) = reads

            read1:
            tuple(sample, r1)

            read2:
            tuple(sample, r2)
        }
        .set { reads }

    lib_r1 = file('test/lib_R1.fa')
    lib_r2 = file('test/lib_R2.fa')

    foo_r1(reads.read1, lib_r1)
    foo_r2(reads.read2, lib_r2)
}
Contents of ./modules/foo.nf:
process foo {
    debug true

    input:
    tuple val(sample), path(fasta)
    path(lib)

    """
    echo $sample, $fasta, $lib
    """
}
Results:
$ nextflow run main.nf
N E X T F L O W ~ version 22.10.0
Launching `main.nf` [confident_boyd] DSL2 - revision: 8c81e2d743
executor > local (6)
[a8/e8a752] process > foo_r1 (2) [100%] 3 of 3 ✔
[75/2b32f5] process > foo_r2 (3) [100%] 3 of 3 ✔
readC, readC_R2.fa, lib_R2.fa
readA, readA_R1.fa, lib_R1.fa
readC, readC_R1.fa, lib_R1.fa
readB, readB_R2.fa, lib_R2.fa
readA, readA_R2.fa, lib_R2.fa
readB, readB_R1.fa, lib_R1.fa
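
As an aside, here is a minimal sketch of the value-channel behaviour mentioned above. The process name BAR and its inputs are made up for illustration; the point is that the queue channel is consumed item by item, while the plain file passed as the second argument is implicitly converted to a value channel and reused by every task:
process BAR {
    debug true

    input:
    val(x)
    path(lib)

    """
    echo $x $lib
    """
}

workflow {
    nums = Channel.of(1, 2, 3)

    // BAR runs three times; the same lib file is used for each task
    BAR(nums, file('test/lib_R1.fa'))
}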

An alternative approach is to key each read file and each library file by its R1/R2 tag and join the two channels on that key:
process FOO {
    debug true

    input:
    tuple val(files), path(lib)

    output:
    stdout

    script:
    file_a = files[0]
    file_b = files[1]
    """
    echo $file_a with $lib
    echo $file_b with $lib
    """
}
workflow {
    Channel
        .of(['read_b', [file('/test/read_b_R1.fa'), file('/test/read_b_R2.fa')]],
            ['read_a', [file('/test/read_a_R1.fa'), file('/test/read_a_R2.fa')]]
        )
        .set { reads }

    Channel
        .of(file('/test/lib_R1.fa'),
            file('/test/lib_R2.fa')
        )
        .set { libs }

    // key each read file by its R1/R2 tag, e.g. read_b_R1.fa -> [R1, read_b_R1.fa]
    reads
        .map { sample, files -> files }
        .flatten()
        .map { file -> [file.name.split('_')[2].split('.fa')[0], file] }
        .groupTuple()
        .set { reads }

    // key each lib file the same way, e.g. lib_R1.fa -> [R1, lib_R1.fa]
    libs
        .map { file -> [file.name.split('_')[1].split('.fa')[0], file] }
        .set { libs }

    // join on the R1/R2 key and drop it before calling FOO
    reads
        .join(libs)
        .map { Rx, path, lib -> [path, lib] }
        | FOO
}
The output of the script above is:
N E X T F L O W ~ version 22.10.4
Launching `ex.nf` [elegant_wiles] DSL2 - revision: 00862286fd
executor > local (2)
[58/9b3cf1] process > FOO (2) [100%] 2 of 2 ✔
/test/read_b_R1.fa with lib_R1.fa
/test/read_a_R1.fa with lib_R1.fa
/test/read_b_R2.fa with lib_R2.fa
/test/read_a_R2.fa with lib_R2.fa
EDIT (in reply to a comment):
If you want the process to run once per element in the channel, check the modified version below:
process FOO {
    debug true

    input:
    tuple val(file), path(lib)

    output:
    stdout

    script:
    """
    echo $file with $lib
    """
}

workflow {
    Channel
        .of(['read_b', [file('/test/read_b_R1.fa'), file('/test/read_b_R2.fa')]],
            ['read_a', [file('/test/read_a_R1.fa'), file('/test/read_a_R2.fa')]]
        )
        .set { reads }

    Channel
        .of(file('/test/lib_R1.fa'),
            file('/test/lib_R2.fa')
        )
        .set { libs }

    reads
        .map { sample, files -> files }
        .flatten()
        .map { file -> [file.name.split('_')[2].split('.fa')[0], file] }
        .groupTuple()
        .set { reads }

    libs
        .map { file -> [file.name.split('_')[1].split('.fa')[0], file] }
        .set { libs }

    reads
        .join(libs)
        .map { Rx, path, lib -> [path, lib] }
        // pair each of the two grouped read files with the lib, then emit
        // them as separate items so FOO runs once per file
        .map { x, y -> [[x[0], y], [x[1], y]] }
        .flatMap()
        | FOO
}
Output:
N E X T F L O W ~ version 22.10.4
Launching `ex.nf` [sharp_ekeblad] DSL2 - revision: 1412af632e
executor > local (4)
[a0/416f59] process > FOO (1) [100%] 4 of 4 ✔
/test/read_b_R2.fa with lib_R2.fa
/test/read_a_R2.fa with lib_R2.fa
/test/read_a_R1.fa with lib_R1.fa
/test/read_b_R1.fa with lib_R1.fa

Related

RisingEdge example doesn't work for module input signal in Chisel3

The Chisel documentation has an example of a rising-edge detection method defined as follows:
def risingedge(x: Bool) = x && !RegNext(x)
All the example code is available in my GitHub project blp.
If I use it on an input signal declared as follows:
class RisingEdge extends Module {
  val io = IO(new Bundle{
    val sclk  = Input(Bool())
    val redge = Output(Bool())
    val fedge = Output(Bool())
  })

  // seems to not work with icarus + cocotb
  def risingedge(x: Bool) = x && !RegNext(x)
  def fallingedge(x: Bool) = !x && RegNext(x)

  // works with icarus + cocotb
  //def risingedge(x: Bool) = x && !RegNext(RegNext(x))
  //def fallingedge(x: Bool) = !x && RegNext(RegNext(x))

  io.redge := risingedge(io.sclk)
  io.fedge := fallingedge(io.sclk)
}
With this Icarus/cocotb testbench:
class RisingEdge(object):
    def __init__(self, dut, clock):
        self._dut = dut
        self._clock_thread = cocotb.fork(clock.start())

    @cocotb.coroutine
    def reset(self):
        short_per = Timer(100, units="ns")
        self._dut.reset <= 1
        self._dut.io_sclk <= 0
        yield short_per
        self._dut.reset <= 0
        yield short_per

@cocotb.test()
def test_rising_edge(dut):
    dut._log.info("Launching RisingEdge test")
    redge = RisingEdge(dut, Clock(dut.clock, 1, "ns"))
    yield redge.reset()
    cwait = Timer(10, "ns")
    for i in range(100):
        dut.io_sclk <= 1
        yield cwait
        dut.io_sclk <= 0
        yield cwait
I never get rising pulses on io.redge and io.fedge. To get the pulse, I have to change the definition of risingedge as follows:
def risingedge(x: Bool) = x && !RegNext(RegNext(x))
With dual RegNext(): (waveform screenshot showing the expected pulses)
With simple RegNext(): (waveform screenshot showing no pulses)
Is this normal behavior?
[Edit: I updated the source example to the GitHub example given above]
I'm not sure about Icarus, but here is what I see using the default Treadle simulator with a test like this:
class RisingEdgeTest extends FreeSpec {
  "debug should toggle" in {
    iotesters.Driver.execute(Array("-tiwv"), () => new SlaveSpi) { c =>
      new PeekPokeTester(c) {
        for (i <- 0 until 10) {
          poke(c.io.csn, i % 2)
          println(s"debug is ${peek(c.io.debug)}")
          step(1)
        }
      }
    }
  }
}
I see the output:
[info] [0.002] debug is 0
[info] [0.002] debug is 1
[info] [0.002] debug is 0
[info] [0.003] debug is 1
[info] [0.003] debug is 0
[info] [0.003] debug is 1
[info] [0.004] debug is 0
[info] [0.004] debug is 1
[info] [0.005] debug is 0
[info] [0.005] debug is 1
And the waveform looks like this (screenshot omitted).
Can you explain what you think this should look like?
Do not change a module input value on the rising edge of the clock.
OK, I found my bug. In the cocotb testbench I toggled the input values on the same edge as the synchronous clock. If we do that, the input changes right inside the setup window of the flip-flop, so the behavior is undefined!
So the problem was a cocotb testbench bug and not a Chisel bug. To solve it, we just have to toggle the values on the other clock edge, like this:
@cocotb.test()
def test_rising_edge(dut):
    dut._log.info("Launching RisingEdge test")
    redge = RisingEdge(dut, Clock(dut.clock, 1, "ns"))
    yield redge.reset()
    cwait = Timer(4, "ns")
    yield FallingEdge(dut.clock)  # <--- 'synchronize' on falling edge
    for i in range(5):
        dut.io_sclk <= 1
        yield cwait
        dut.io_sclk <= 0
        yield cwait

Reading file line by line in Perl6, how to do idiomatically?

I have a rudimentary script in Perl 6 which runs very slowly, about 30x slower than the exact Perl 5 translation.
CONTROL {
    when CX::Warn {
        note $_;
        exit 1;
    }
}
use fatal;

role KeyRequired {
    method AT-KEY (\key) {
        die "Key {key} not found" unless self.EXISTS-KEY(key);
        nextsame;
    }
}

for dir(test => /^nucleotide_\d**2_\d**2..3\.tsv$/) -> $tsv {
    say $tsv;
    my $qqman = $tsv.subst(/\.tsv$/, '.qqman.tsv');
    my $out = open $qqman, :w;
    put "\t$qqman";
    my UInt $line-no = 0;
    for $tsv.lines -> $line {
        if $line-no == 0 {
            $line-no = 1;
            $out.put(['SNP', 'CHR', 'BP', 'P', 'zscore'].join("\t"));
            next
        }
        if $line ~~ /.+X/ {
            next
        }
        $line-no++;
        my @line = $line.split(/\s+/);
        my $chr = @line[0];
        my $nuc = @line[1];
        my $p = @line[3];
        my $zscore = @line[2];
        my $snp = "'rs$line-no'";
        $out.put([$snp, $chr, $nuc, $p, $zscore].join("\t"));
        #$out.put();
    }
    last
}
In Perl 5 this would be an idiomatic while loop.
This is a very simple script, which only alters columns of text in a file. This Perl 6 script runs in 30 minutes; the Perl 5 translation runs in 1 minute.
I've tried reading "Using Perl6 to process a large text file, and it's Too Slow (2014-09)" and "Perl6: What is the best way for dealing with very big files?", but I'm not seeing anything that could help me here :(
I'm running Rakudo version 2018.03 built on MoarVM version 2018.03
implementing Perl 6.c.
I realize that Rakudo hasn't matured to Perl5's level (yet, I hope), but how can I get this to read the file line by line in a more reasonable time frame?
There are a bunch of things I would change:
/.+X/ can be simplified to just /.X/ or even $line.substr(1).contains('X')
$line.split(/\s+/) can be simplified to $line.words
$tsv.subst(/\.tsv$/, '.qqman.tsv') can be simplified to $tsv.substr(0, *-4) ~ '.qqman.tsv'
uint instead of UInt
given .head {} instead of for … {last}
given dir(test => /^nucleotide_\d**2_\d**2..3\.tsv$/).head -> $tsv {
    say $tsv;
    my $qqman = $tsv.substr(0, *-4) ~ '.qqman.tsv';
    my $out = open $qqman, :w;
    put "\t$qqman";
    my uint $line-no = 0;
    for $tsv.lines -> $line {
        FIRST {
            $line-no = 1;
            $out.put(('SNP', 'CHR', 'BP', 'P', 'zscore').join("\t"));
            next
        }
        next if $line.substr(1).contains('X');
        ++$line-no;
        my ($chr, $nuc, $zscore, $p) = $line.words;
        my $snp = "'rs$line-no'";
        $out.put(($snp, $chr, $nuc, $p, $zscore).join("\t"));
        #$out.put();
    }
}

"Mix" operator does not wait for upstream processes to finish

I have several upstream processes, say A, B and C, doing similar tasks.
Downstream of that, I have one process X that needs to treat all the outputs of A, B and C in the same way.
I tried to use the mix operator to create a single channel from the output files of A, B and C, like so:
process A {
    output:
    file outA
}

process B {
    output:
    file outB
}

process C {
    output:
    file outC
}

inX = outA.mix(outB, outC)

process X {
    input:
    file inX

    "myscript.sh"
}
Process A often finishes before B and C, and somehow process X does not wait for processes B and C to finish, and only takes the outputs of A as input.
The following snippet works nicely:
process A {
    output:
    file outA

    """
    touch outA
    """
}

process B {
    output:
    file outB

    """
    touch outB
    """
}

process C {
    output:
    file outC

    """
    touch outC
    """
}

inX = outA.mix(outB, outC)

process X {
    input:
    file inX

    "echo myscript.sh"
}
If you continue to experience the same problem, feel free to open an issue including a reproducible test case.

How can I implement coroutines for a parallel task

So, I have this piece of code:
for (z in 0 until texture.extent.z) {
    println(z)
    for (y in 0 until texture.extent.y)
        for (x in 0 until texture.extent.x) {
            val v = Vec3(x, y, z) / texture.extent
            var n = when {
                FRACTAL -> FractalNoise().noise(v * noiseScale)
                else -> 20f * glm.perlin(v)
            }
            n -= glm.floor(n)
            data[x + y * texture.extent.x + z * texture.extent.x * texture.extent.y] = glm.floor(n * 255).b
        }
}
That takes over 4 minutes on the JVM. The original C++ sample uses OpenMP to accelerate the calculation.
I've heard about coroutines and I hope I can take advantage of them in this case.
I first tried to wrap the nested loops in runBlocking, because I want all the coroutines to have finished before I move on.
runBlocking {
    for (z in 0 until texture.extent.z) {
        println(z)
        for (y in 0 until texture.extent.y)
            for (x in 0 until texture.extent.x) {
                launch {
                    val v = Vec3(x, y, z) / texture.extent
                    var n = when {
                        FRACTAL -> FractalNoise().noise(v * noiseScale)
                        else -> 20f * glm.perlin(v)
                    }
                    n -= glm.floor(n)
                    data[x + y * texture.extent.x + z * texture.extent.x * texture.extent.y] = glm.floor(n * 255).b
                }
            }
    }
}
But this throws various thread errors plus a final JVM crash:
[thread 27624 also had an error][thread 23784 also had an error]# A fatal error has been detected by the Java Runtime Environment:
#
[thread 27624 also had an error][thread 23784 also had an error]# A fatal error has been detected by the Java Runtime Environment:
#
# [thread 14004 also had an error]EXCEPTION_ACCESS_VIOLATION
[thread 32652 also had an error] (0xc0000005)[thread 32616 also had an error]
at pc=0x0000000002d2fd50
, pid=23452[thread 21264 also had an error], tid=0x0000000000007b68
#
# JRE version: Java(TM) SE Runtime Environment (8.0_144-b01) (build 1.8.0_144-b01)
# Java VM: Java HotSpot(TM) 64-Bit Server VM (25.144-b01 mixed mode windows-amd64 compressed oops)
# Problematic frame:
# J 1431 C2 java.util.concurrent.ForkJoinPool$WorkQueue.runTask(Ljava/util/concurrent/ForkJoinTask;)V (86 bytes) # 0x0000000002d2fd50 [0x0000000002d2f100+0xc50]
#
# Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
#
# An error report file with more information is saved as:
# C:\Users\gBarbieri\IdeaProjects\Vulkan\hs_err_pid23452.log
#
# If you would like to submit a bug report, please visit:
# http://bugreport.java.com/bugreport/crash.jsp
#
Process finished with exit code 1
I also tried collecting all the jobs into an ArrayList and calling join() on them at the end, but without success.
Can coroutines be used for a parallel task like this one?
If yes, what am I doing wrong?
Instead of coroutines you should consider the parallel computation engine built into the JDK: java.util.stream. What you have here is an embarrassingly parallelizable task, a perfect use case for it.
I'd use something along these lines:
IntStream.range(0, extent.x)
    .boxed()
    .parallel()
    .flatMap { x ->
        IntStream.range(0, extent.y).boxed().flatMap { y ->
            IntStream.range(0, extent.z).mapToObj { z ->
                Vec(x, y, z)
            }
        }
    }
    .forEach { vec ->
        data[vecToArrayIndex(vec)] = computeValue(vec)
    }
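For context, here is a rough sketch of how the placeholder helpers used above might map back onto the loop body from the question. The names Vec, vecToArrayIndex and computeValue come from the snippet above; their bodies below are assumptions based on the original code, reusing the same texture, Vec3, glm, FractalNoise, FRACTAL and noiseScale from the question:
// Hypothetical helpers; the bodies simply mirror the loop body from the question.
data class Vec(val x: Int, val y: Int, val z: Int)

fun vecToArrayIndex(v: Vec): Int =
    v.x + v.y * texture.extent.x + v.z * texture.extent.x * texture.extent.y

fun computeValue(v: Vec): Byte {
    val p = Vec3(v.x, v.y, v.z) / texture.extent
    var n = if (FRACTAL) FractalNoise().noise(p * noiseScale) else 20f * glm.perlin(p)
    n -= glm.floor(n)
    return glm.floor(n * 255).b
}
Writing into data from the parallel forEach is safe here because each (x, y, z) maps to a distinct index, so no two threads touch the same element.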

How to fork/clone a process in Erlang

How do I fork/clone a process in Erlang, like fork in Unix?
I have searched a lot but found nothing.
Maybe the usage looks like this:
case fork() of
    {parent, Pid} ->
        in_parent_process_now();
    {child, Pid} ->
        in_child_process_now();
    {error, Msg} ->
        report_fork_error(Msg)
end.
Any ideas?
EDIT:
In order to explain my point better, take the following C code as an example:
f();
fork();
g();
Here the return value of fork() is ignored, so the next step of both the parent process and the child process is the same: to execute g().
Can I achieve this in Erlang?
(This question was also answered in the erlang-questions mailing list.)
Erlang does not have a 'fork' operation. It does have a spawn operation, however:
parent_process() ->
    will_be_executed_by_parent_process(),
    spawn(fun() -> will_be_executed_by_child_process() end),
    will_also_be_executed_by_parent_process().
... where function names show in what context they will be executed. Note that any data passed to the child process will be copied to the new process' heap.
As you know, there is a generic pattern for implementing processes in Erlang:
loop( State ) ->
    receive
        Message ->
            NewState = process( Message, State ),
            loop( NewState )
    end.
At any point in time the process has a State. So if you want to "fork" some process from the current one, you have to send it a specific message. The process has to recognize that message and spawn a new process with a copy of its current state.
I've created an example to illustrate the text above:
-module( test ).
-export( [ fork/1, get_state/1, change_state/2 ] ).
-export( [ loop/1 ] ).

loop( State ) ->
    receive
        { fork, Sender } ->
            %%
            %% if you want to link with child process
            %% call spawn_link instead of spawn
            %%
            ClonePid = spawn( ?MODULE, loop, [ State ] ),
            responseTo( Sender, ClonePid ),
            loop( State );
        { get_state, Sender } ->
            responseTo( Sender, { curr_state, State } ),
            loop( State );
        { change_state, Data, Sender } ->
            { Response, NewState } = processData( Data, State ),
            responseTo( Sender, Response ),
            loop( NewState )
    end.

fork( Pid ) ->
    Ref = make_ref(),
    Pid ! { fork, { Ref, self() } },
    get_response( Ref ).

get_state( Pid ) ->
    Ref = make_ref(),
    Pid ! { get_state, { Ref, self() } },
    get_response( Ref ).

change_state( Pid, Data ) ->
    Ref = make_ref(),
    Pid ! { change_state, Data, { Ref, self() } },
    get_response( Ref ).

get_response( Ref ) ->
    receive
        { Ref, Message } -> Message
    end.

responseTo( { Ref, Pid }, Mes ) ->
    Pid ! { Ref, Mes }.

processData( Data, State ) ->
    %%
    %% here comes logic of processing data
    %% and changing process state
    %%
    NewState = Data,
    Response = { { old_state, State }, { new_state, NewState } },
    { Response, NewState }.
Let's test it in the Erlang shell:
1> c(test).
{ok,test}
Create a parent process with initial state first_state:
2> ParentPid = spawn( test, loop, [ first_state ] ).
<0.38.0>
3> test:get_state( ParentPid ).
{curr_state,first_state}
4>
Let's change the state of the parent process to second_state:
4> test:change_state( ParentPid, second_state ).
{{old_state,first_state},{new_state,second_state}}
Fork a new process from the parent process:
5> ChildPid = test:fork( ParentPid ).
<0.42.0>
Check the state of the forked process (it is the same as in the parent process):
6> test:get_state( ChildPid ).
{curr_state,second_state}
There is no fork in Erlang, but you can use one of spawn/1, spawn/2, spawn/3, spawn/4 (see also spawn_link), which are BIFs of the erlang module.
So, for example:
-module(mymodule).
-export([parent_fun/0]).

parent_fun() ->
    io:format("this is the parent with pid: ~p~n", [self()]),
    spawn(fun() -> child_fun() end),
    io:format("still in parent process: ~p~n", [self()]).

child_fun() ->
    io:format("this is child process with pid: ~p~n", [self()]).
Execute it in the Erlang shell as:
mymodule:parent_fun().
Note that parent process and child process have different pids.
I strongly suggest you read: http://learnyousomeerlang.com/the-hitchhikers-guide-to-concurrency