diff --git a/.config/masterconfig.yaml b/.config/masterconfig.yaml
index d69acfe304b16874e426c176a8e07d071538fdec..df713a879b5327174b25923b33caccc690e65f98 100644
--- a/.config/masterconfig.yaml
+++ b/.config/masterconfig.yaml
@@ -1,17 +1,35 @@
-# absolute path to your desired output path
-root: /output/path
+# absolute or relative path to your desired output directory
+root: .
 
 ####################### optional prejob - data preparation #######################
 # path to tar data
-data: /path
+data: test_data
 
 # list of tar names
-get_all_tar_filename: False
-tarIDS: "tar_filename"
+get_all_tar_filename: True
+tarIDS: []
 
 ####################### job - workflow #######################
 ### CONFIG
-
-
+get_all_filenames: True
+IDS: ["sd_0001.ccs", "sd_0002.ccs", "sd_0003.ccs"]
+
+sd_0001.ccs:
+  run: run001
+  ploidy: 2
+  busco_lineage: eudicots_odb10
+  mode: default
+
+sd_0002.ccs:
+  run: run002
+  ploidy: 2
+  busco_lineage: eudicots_odb10
+  mode: default
+
+sd_0003.ccs:
+  run: run003
+  ploidy: 2
+  busco_lineage: eudicots_odb10
+  mode: default
 
 ####################### workflow output directories #######################
 # results directory
diff --git a/README.md b/README.md
index cc5c293cf2bd05321d5eb3f827961a8c265aa52e..cd355f043017ae3ab9fa359fd883735b54cb6e22 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,41 @@
 # <A HREF="https://forgemia.inra.fr/asm4pg/GenomAsm4pg"> asm4pg </A>
+
 An automatic and reproducible genome assembly workflow for pangenomic applications using PacBio HiFi data.
 
 This workflow uses [Snakemake](https://snakemake.readthedocs.io/en/stable/) to quickly assemble genomes with a HTML report summarizing obtained assembly stats.
 
-A first script (```prejob.sh```) prepares the data until *fasta.gz* files are obtained. A second script (```job.sh```) runs the genome assembly and stats.
+A first script (`prejob.sh`) takes `.tar` file(s) as input, converts `.bam` files to `.fastq.gz` and `.fasta.gz`, and creates a `00_raw_data` folder with several subfolders (the detailed folder structure is described below). This step can be skipped if the user already has `fasta.gz`/`fastq.gz` files placed in folders with the same structure: `fastq.gz` is mandatory for the raw data QC steps, while `fasta.gz` alone is enough if QC is not required. If a sample was sequenced over several HiFi runs, the user must combine them into a single input for `job.sh` (see the sketch below).
+
+A second script (`job.sh`) runs the genome assembly and stats.
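+
+For illustration only, multiple runs can be merged without re-compression, since concatenated gzip members still form a valid gzip file (hypothetical filenames below):
+
+```python
+# minimal sketch: merge two HiFi runs into a single input for job.sh
+# (run1/run2 names are assumptions; adapt them to your data)
+import shutil
+
+runs = ["run1.ccs.fasta.gz", "run2.ccs.fasta.gz"]
+with open("sample.ccs.fasta.gz", "wb") as merged:
+    for run in runs:
+        with open(run, "rb") as part:
+            shutil.copyfileobj(part, merged)
+```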
 doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg)
 
 ## Table of contents
-[TOC]
 
-## Repo directory structure
+- [ asm4pg ](#-asm4pg-)
+  - [Table of contents](#table-of-contents)
+  - [Repo directory structure](#repo-directory-structure)
+  - [Requirements](#requirements)
+  - [Workflow steps, programs \& Docker images pulled by Snakemake](#workflow-steps-programs--docker-images-pulled-by-snakemake)
+  - [How to run the workflow](#how-to-run-the-workflow)
+    - [Profile setup](#profile-setup)
+    - [Workflow execution](#workflow-execution)
+      - [Running the prejob](#running-the-prejob)
+      - [Running the main workflow](#running-the-main-workflow)
+    - [Dry run](#dry-run)
+    - [Outputs](#outputs)
+  - [Known problems/errors](#known-problemserrors)
+    - [HPC](#hpc)
+    - [BUSCO](#busco)
+    - [HiFi assembly](#hifi-assembly)
+    - [Snakemake locked directory](#snakemake-locked-directory)
+  - [How to cite asm4pg?](#how-to-cite-asm4pg)
+  - [License](#license)
+  - [Contacts](#contacts)
+## Repo directory structure
 
 ```
 ├── README.md
@@ -40,95 +62,117 @@ doc: [Gitlab pages](https://asm4pg.pages.mia.inra.fr/genomasm4pg)
 ```
 
 ## Requirements
+
 - snakemake >= 6.5.1
+- slurm
+- conda
 - singularity
 
 ## Workflow steps, programs & Docker images pulled by Snakemake
+
 All images here will be pulled automatically by Snakemake the first time you run the workflow. It may take some time. Images are only downloaded once and reused automatically by the workflow.
 
 Images are stored on the project's container registry but come from various container libraries:
 
 **Pre-assembly**
+
 - Conversion of PacBio bam to fasta & fastq
-  - **smrtlink** (https://www.pacb.com/support/software-downloads/)
-    - image version: 9.0.0.92188 ([link](https://hub.docker.com/r/bryce911/smrtlink/tags))
+  - **smrtlink** (https://www.pacb.com/support/software-downloads/)
+    - image version: 9.0.0.92188 ([link](https://hub.docker.com/r/bryce911/smrtlink/tags))
 - Fastq to fasta conversion
-  - **seqtk** (https://github.com/lh3/seqtk)
-    - image version: 1.3--dc0d16b ([link](https://hub.docker.com/r/nanozoo/seqtk))
+  - **seqtk** (https://github.com/lh3/seqtk)
+    - image version: 1.3--dc0d16b ([link](https://hub.docker.com/r/nanozoo/seqtk))
 - Raw data quality control
-  - **fastqc** (https://github.com/s-andrews/FastQC)
-    - image version: v0.11.5_cv4 ([link](https://hub.docker.com/r/biocontainers/fastqc/tags))
-  - **lonqQC** (https://github.com/yfukasawa/LongQC)
-    - image version: latest (April 2022) ([link](https://hub.docker.com/r/grpiccoli/longqc/tags))
+  - **fastqc** (https://github.com/s-andrews/FastQC)
+    - image version: v0.11.5_cv4 ([link](https://hub.docker.com/r/biocontainers/fastqc/tags))
+  - **LongQC** (https://github.com/yfukasawa/LongQC)
+    - image version: latest (April 2022) ([link](https://hub.docker.com/r/grpiccoli/longqc/tags))
 - Metrics
-  - **genometools** (https://github.com/genometools/genometools)
-    - image version: v1.5.9ds-4-deb_cv1 ([link](https://hub.docker.com/r/biocontainers/genometools/tags))
+  - **genometools** (https://github.com/genometools/genometools)
+    - image version: v1.5.9ds-4-deb_cv1 ([link](https://hub.docker.com/r/biocontainers/genometools/tags))
 - K-mer analysis
-  - **jellyfish** (https://github.com/gmarcais/Jellyfish)
-    - image version: 2.3.0--h9f5acd7_3 ([link](https://quay.io/repository/biocontainers/kmer-jellyfish?tab=tags))
-  - **genomescope** (https://github.com/tbenavi1/genomescope2.0)
-    - image version: 2.0 ([link](https://hub.docker.com/r/abner12/genomescope))
+  - **jellyfish** (https://github.com/gmarcais/Jellyfish)
+    - image version: 2.3.0--h9f5acd7_3 ([link](https://quay.io/repository/biocontainers/kmer-jellyfish?tab=tags))
+  - **genomescope** (https://github.com/tbenavi1/genomescope2.0)
+    - image version: 2.0 ([link](https://hub.docker.com/r/abner12/genomescope))
 
 **Assembly**
+
 - Assembly
-  - **hifiasm** (https://github.com/chhylp123/hifiasm)
-    - image version: 0.16.1--h5b5514e_1 ([link](https://quay.io/repository/biocontainers/hifiasm?tab=tags))
+  - **hifiasm** (https://github.com/chhylp123/hifiasm)
+    - image version: 0.16.1--h5b5514e_1 ([link](https://quay.io/repository/biocontainers/hifiasm?tab=tags))
 - Metrics
-  - **genometools** (same as Pre-assembly)
+  - **genometools** (same as Pre-assembly)
 - Assembly quality control
-  - **busco** (https://gitlab.com/ezlab/busco)
-    - image version: v5.3.1_cv1 ([link](https://hub.docker.com/r/ezlabgva/busco/tags))
-  - **kat** (https://github.com/TGAC/KAT)
-    - image version: 2.4.1--py35h355e19c_3 ([link](https://quay.io/repository/biocontainers/kat))
+  - **busco** (https://gitlab.com/ezlab/busco)
+    - image version: v5.3.1_cv1 ([link](https://hub.docker.com/r/ezlabgva/busco/tags))
+  - **kat** (https://github.com/TGAC/KAT)
+    - image version: 2.4.1--py35h355e19c_3 ([link](https://quay.io/repository/biocontainers/kat))
 - Error rate, QV & phasing
-  - **meryl** and **merqury** (https://github.com/marbl/meryl, https://github.com/marbl/merqury)
-    - image version: 1.3--hdfd78af_0 ([link](https://quay.io/repository/biocontainers/merqury?tab=tags))
+  - **meryl** and **merqury** (https://github.com/marbl/meryl, https://github.com/marbl/merqury)
+    - image version: 1.3--hdfd78af_0 ([link](https://quay.io/repository/biocontainers/merqury?tab=tags))
 - Detect assembled telomeres
-  - **FindTelomeres** (https://github.com/JanaSperschneider/FindTelomeres)
-    - **Biopython** image version: 1.75 ([link](https://quay.io/repository/biocontainers/biopython?tab=tags))
-- Haplotigs and overlaps purging
-  - **purge_dups** (https://github.com/dfguan/purge_dups)
-    - image version: 1.2.5--h7132678_2 ([link](https://quay.io/repository/biocontainers/purge_dups?tab=tags))
-  - **matplotlib** image version: v0.11.5-5-deb-py3_cv1 ([link](https://hub.docker.com/r/biocontainers/matplotlib-venn/tags))
+  - **FindTelomeres** (https://github.com/JanaSperschneider/FindTelomeres)
+    - **Biopython** image version: 1.75 ([link](https://quay.io/repository/biocontainers/biopython?tab=tags))
+- Haplotigs and overlaps purging
+  - **purge_dups** (https://github.com/dfguan/purge_dups)
+    - image version: 1.2.5--h7132678_2 ([link](https://quay.io/repository/biocontainers/purge_dups?tab=tags))
+  - **matplotlib** image version: v0.11.5-5-deb-py3_cv1 ([link](https://hub.docker.com/r/biocontainers/matplotlib-venn/tags))
 
 **Report**
+
 - **R markdown**
-  - image version: 4.0.3 ([link](https://hub.docker.com/r/reslp/rmarkdown/tags))
+  - image version: 4.0.3 ([link](https://hub.docker.com/r/reslp/rmarkdown/tags))
 
 ## How to run the workflow
 
 ### Profile setup
+
 The current profile is made for SLURM. To run this workflow on another HPC, create another profile (https://github.com/Snakemake-Profiles) and add it in the `.config/snakemake_profile` directory. Change the `CLUSTER_CONFIG` and `PROFILE` variables in `job.sh` and `prejob.sh`. If you are using the current SLURM setup, change line 13 to your email adress in the `cluster_config`.yml file.
 
-### SLURM logs
-SLURM submission scripts, prejob.sh and job.sh, output standard and error output into slurm_logs directory. This directory must exist before running any of these submission script else slurm will refuse to submit these jobs.
+## Workflow execution
+
+Navigate into the `GenomAsm4pg` directory to run the bash scripts.
+
+## Running the prejob
+
+Create a `test_data` folder to hold the test data that will be used to run the pipeline.
 
 ```
-# create if not exist
-mkdir -p slurm_logs
+mkdir -p test_data
 ```
 
-### Workflow execution
-Go in the `Assemb_v2_Snakemake_FullAuto` directory to run the bash scripts.
+Download the test data from `raw.github...` and place it into the `test_data` folder.
 
-1. **Data preparation**
+Modify the following variables in these files:
+
+`.config/masterconfig.yaml`:
-Modify the following variables in file `.config/masterconfig.yaml`:
 - `root`
-  - The absolute path where you want the output to be.
+  - The path where you want the output to be. This can be relative or absolute.
+  - Set this to the repository folder, `.`.
 - `data`
-  - The path to the directory containing all input tar files.
-This workflow can automatically determine the name of files in the specified `data` directory, or run only on given files :
-- `get_all_tar_filename: True` will uncompress all tar files. If you want to choose the the files to uncompress, use `get_all_tar_filename: False` and give the filenames as a list in `tarIDS`
+  - The path where the input data (`.tar`) is located.
+  - Set this to `test_data`.
+  - Alternatively, you can run only on user-specified files:
+    - Setting `get_all_tar_filename: True` will uncompress all tar files.
+    - If you want to choose the files to uncompress, set `get_all_tar_filename: False` and give the filenames as a list in `tarIDS`.
+
+`./prejob.sh`:
+- Line 17, `#SBATCH --mail-user=`
+  - Set this to be your email address.
+- `Module Loading:`
+  - If Singularity is not in the HPC environment, add `module load singularity` under Module loading.
+
+Once these variables have been set, run the following:
-Modify the `SNG_BIND` variable in `prejob.sh`, it has to be the same as the variable `root` in `.config/masterconfig.yaml`. Change line 17 to your email adress.
-If Singularity is not in the HPC environement, add `module load singularity` under Module loading.
-Then run
 
 ```bash
 sbatch prejob.sh
 ```
+
+This will create multiple directories to prepare the data for the workflow. You will end up with a `bam_files` directory containing all _bam_ files, renamed to the tar filename if your data was named "ccs.bam", and a `fastx_files` directory containing all _fasta_ and _fastq_ files. The `extract` directory contains all other files that were in the tarball.
+
 ```
 workflow_results
 └── 00_raw_data
@@ -137,23 +181,27 @@ workflow_results
     └── fastx_files
 ```
 
-2. **Running the workflow**
+## Running the main workflow
+
+The `fastx_files` directory will be the starting point for the assembly workflow. You can add other datasets, but the workflow needs a _fasta.gz_ file. If _bam_ files or _fastq.gz_ files are available, the workflow runs raw data quality control steps (the sketch below illustrates this naming convention).
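+
+The naming convention can be sketched as follows (illustrative Python, not the workflow's own code; it mirrors the `split(".")` logic of the helpers in `workflow/scripts/from_config/target_list.py`):
+
+```python
+# sketch: which workflow step an input file triggers, judged by its extension
+def classify(filename):
+    parts = filename.split(".")
+    if parts[-1] == "gz" and parts[-2] == "fasta":
+        return "assembly input (mandatory)"
+    if parts[-1] == "gz" and parts[-2] == "fastq":
+        return "raw data QC with FastQC (optional)"
+    if parts[-1] == "bam":
+        return "raw data QC with LongQC (optional)"
+    return "ignored"
+
+print(classify("sd_0001.ccs.fasta.gz"))  # assembly input (mandatory)
+```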
+
+You will have to modify other variables in `.config/masterconfig.yaml`:
-The `fastx_files` directory will be the starting point for the assembly workflow. You can add other datasets but the workflow needs a *fasta.gz* file. If *bam* files or *fastq.gz* files are available, the workflow runs raw data quality control steps.
+- Setting `get_all_filenames: True` will take all of the `.fasta.gz` files in the `fastx_files` directory and set them as a list in `IDS`.
+- Alternatively, give the fasta filenames as a list in `IDS` to specify the files you want to run the pipeline on.
+
+Your config should also follow this template:
-You will have to modify other variables in file `.config/masterconfig.yaml`:
-- Give the fasta filenames as a list in `IDS`.
-- Your config should follow this template
 
 ```yaml
 # default assembly mode
-sample_1:
+sample_1_file_name:
   run: name
   ploidy: 2
   busco_lineage: eudicots_odb10
   mode: default
 
 # trio assembly mode
-sample_2:
+sample_2_file_name:
   run: name
   ploidy: 2
   busco_lineage: eudicots_odb10
@@ -162,7 +210,7 @@ sample_2:
   p2: path/to/parent/2/reads
 
 # hi-c assembly mode
-sample_3:
+sample_3_file_name:
   run: name
   ploidy: 2
   busco_lineage: eudicots_odb10
@@ -170,35 +218,48 @@ sample_3:
   r1: path/to/r1/reads
   r2: path/to/r2/reads
 ```
-- Choose your run name with `run`.
+
+- Make sure to set `sample_1_file_name` (and the other sample keys) to match the file names in the `fastx_files` directory. An example can be seen in the `masterconfig.yaml` file, which is configured to run on the provided test data.
+- Choose your run name by setting `run`.
 - Specify the organism ploidy with `ploidy`.
 - Choose the BUSCO lineage with `lineage`.
 - There are 3 modes to run hifiasm. In all cases, the organism has to be sequenced in PacBio HiFi. To choose the mode, modify the variable `mode` to either :
-  - `default` for a HiFi-only assembly.
-  - `trio` if you have parental reads (either HiFi or short reads) in addition to the sequencing of the organism.
-    - Add a key corresponding to your filename and modify the variables `p1` and `p2` to be the parental reads. Supported filetypes are *fasta*, *fasta.gz*, *fastq* and *fastq.gz*.
-  - `hi-c` if the organism has been sequenced in paired-end Hi-C as well.
-    - Add a key corresponding to your filename an modify the variables `r1` and `r2` to be the paired-end Hi-C reads. Supported filetypes are *fasta*, *fasta.gz*, *fastq* and *fastq.gz*.
-
-Modify the `SNG_BIND` variable in `job.sh`, it has to be the same as the variable `root` in `.config/masterconfig.yaml`. Change line 17 to your email adress.
-If Singularity is not in the HPC environement, add `module load singularity` under Module loading.
-Then run
+  - `default` for a HiFi-only assembly.
+  - `trio` if you have parental reads (either HiFi or short reads) in addition to the sequencing of the organism.
+    - Add a key corresponding to your filename and modify the variables `p1` and `p2` to be the parental reads. Supported filetypes are _fasta_, _fasta.gz_, _fastq_ and _fastq.gz_.
+  - `hi-c` if the organism has been sequenced in paired-end Hi-C as well.
+    - Add a key corresponding to your filename and modify the variables `r1` and `r2` to be the paired-end Hi-C reads. Supported filetypes are _fasta_, _fasta.gz_, _fastq_ and _fastq.gz_.
+
+Modify the following variables in `./job.sh`:
+
+- Line 17, `#SBATCH --mail-user=`
+  - Set this to be your email address.
+- `Module Loading:`
+  - If Singularity is not in the HPC environment, add `module load singularity` under Module loading.
+
+Once these variables have been set, run the following:
 
 ```bash
 sbatch job.sh
 ```
 
-All the slurm output logs are in the `slurm_logs` directory. There are .out and .err files for the worklow (*snakemake.cortex**) and for each rules (*rulename.cortex**).
+All the slurm output logs are in the `slurm_logs` directory. There are .out and .err files for the workflow (*snakemake.cortex\*\*) and for each rule (*rulename.cortex\*\*).
 
 ### Dry run
+
 To check if the workflow will run fine, you can do a dry run: uncomment line 56 in `job.sh` and comment line 59, then run
+
 ```bash
 sbatch job.sh
 ```
-Check the snakemake.cortex*.out file in the `slurm_logs` directory, you should see a summary of the workflow.
+
+Check the snakemake.cortex\*.out file in the `slurm_logs` directory; you should see a summary of the workflow.
 
 ### Outputs
+
 These are the directories for the data produced by the workflow:
-- An automatic report is generated in the `RUN` directory.
+
+- An automatic report is generated in each `RUN` directory.
 - `01_raw_data_QC` contains all quality control ran on the reads. FastQC and LongQC create html reports on fastq and bam files respectively, reads stats are given by Genometools, and predictions of genome size and heterozygosity are given by Genomescope (in directory `04_kmer`).
 - `02_genome_assembly` contains 2 assemblies. The first one is in `01_raw_assembly`, it is the assembly obtained with hifiasm. The second one is in `02_after_purge_dups_assembly`, it is the hifiasm assembly after haplotigs removal by purge_dups. Both assemblies have a `01_assembly_QC` directory containing assembly statistics done by Genometools (in directory `assembly_stats`), BUSCO analyses (`busco`), k-mer profiles with KAT (`katplot`) and completedness and QV stats with Merqury (`merqury`) as well as assembled telomeres with FindTelomeres (`telomeres`).
@@ -235,21 +296,32 @@ workflow_results
 ```
 
 ## Known problems/errors
+
 ### HPC
+
 The workflow does not work if the HPC does not allow a job to run other jobs.
+
 ### BUSCO
+
 The first time you run the workflow, if there are multiple samples, the BUSCO lineage might be downladed multiple times. This can create a conflict between the jobs using BUSCO and may interrupt some of them. In that case, you only need to rerun the workflow once everything is done.
+
+### HiFi assembly
+
+If your pipeline fails at the hifiasm step, this may be caused by improper input data. Please make sure that there are no 'N' or undefined bases in your raw data.
+
 ### Snakemake locked directory
+
 When you try to rerun the workflow after cancelling a job, you may have to unlock the results directory. To do so, go in `.config/snakemake_profile/slurm` and uncomment line 14 of `config.yaml`. Run the workflow once to unlock the directory (it should only take a few seconds). Still in `config.yaml`, comment line 14. The workflow will be able to run and create outputs.
 
-## How to cite asm4pg? ##
+## How to cite asm4pg?
+
+We are currently writing a publication about asm4pg. Meanwhile, if you use the pipeline, please cite it using the address of this repository.
 
-We are currently writing a publication about asm4pg. Meanwhile, if you use the pipeline, please cite it using the address of this repository.
+## License -## License ## +The content of this repository is licensed under <A HREF="https://choosealicense.com/licenses/gpl-3.0/">(GNU GPLv3)</A> -The content of this repository is licensed under <A HREF="https://choosealicense.com/licenses/gpl-3.0/">(GNU GPLv3)</A> +## Contacts -## Contacts ## For any troubleshouting, issue or feature suggestion, please use the issue tab of this repository. For any other question or if you want to help in developing asm4pg, please contact Ludovic Duvaux at ludovic.duvaux@inrae.fr diff --git a/job.sh b/job.sh index 2e5548e53e0832a2b934183ed8ff5f795d7cfeb5..df91a252545327cd0fd6648a2d9bc7c37053df4c 100644 --- a/job.sh +++ b/job.sh @@ -14,7 +14,7 @@ #SBATCH -o slurm_logs/snakemake.%N.%j.out #SBATCH -e slurm_logs/snakemake.%N.%j.err #SBATCH --mail-type=END,FAIL -#SBATCH --mail-user=sukanya.denni@univ-rouen.fr +#SBATCH --mail-user=ken.smith@plantandfood.co.nz ################################################################################ # Useful information to print @@ -35,16 +35,29 @@ echo 'scontrol show job:' scontrol show job $SLURM_JOB_ID echo '########################################' +## get SNG_BIND abs path using python +function SNG_BIND_ABS_PATH { + SNG_BIND="$(python3 - <<END +import os + +abs_path = os.getcwd() +print(abs_path) + +END +)" +} +SNG_BIND_ABS_PATH ### variables CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" -MAX_CORES=10 +MAX_CORES=4 PROFILE=".config/snakemake_profile/slurm" -SNG_BIND="/gpfs/scratch/sdenni/wf/GenomAsm4pg" +SMK_PATH="workflow/pre-job_snakefiles" ### Module Loading: module purge -module load snakemake/6.5.1 +module load snakemake +module load singularity echo 'Starting Snakemake workflow' diff --git a/prejob.sh b/prejob.sh old mode 100644 new mode 100755 index d65a60ed5d574cd026944c172bac183f7e9a6c25..3095abc51d55cca68b81a2a1ec0ba737e0013cd1 --- a/prejob.sh +++ b/prejob.sh @@ -14,7 +14,7 @@ #SBATCH -o slurm_logs/snakemake_prejob.%N.%j.out #SBATCH -e slurm_logs/snakemake_prejob.%N.%j.err #SBATCH --mail-type=END,FAIL -#SBATCH --mail-user=sukanya.denni@univ-rouen.fr +#SBATCH --mail-user=ken.smith@plantandfood.co.nz ################################################################################ # Useful information to print @@ -35,16 +35,30 @@ echo 'scontrol show job:' scontrol show job $SLURM_JOB_ID echo '########################################' +## get SNG_BIND abs path using python +function SNG_BIND_ABS_PATH { + SNG_BIND="$(python3 - <<END +import os + +abs_path = os.getcwd() +print(abs_path) + +END +)" +} +SNG_BIND_ABS_PATH + ### variables CLUSTER_CONFIG=".config/snakemake_profile/slurm/cluster_config.yml" MAX_CORES=4 PROFILE=".config/snakemake_profile/slurm" SMK_PATH="workflow/pre-job_snakefiles" -SNG_BIND="/gpfs/scratch/sdenni/wf/GenomAsm4pg" + ### Module Loading: module purge -module load snakemake/6.5.1 +module load snakemake +module load singularity echo 'Starting Snakemake - data preparation' diff --git a/workflow/Snakefile b/workflow/Snakefile index 10ff558801f818f6bade7132bd3754d0c3a9a36e..595c81fe597792542c42de67f358884f2deec389 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,52 +1,53 @@ configfile: ".config/masterconfig.yaml" -res_path=config["root"] + "/" + config["resdir"] - ###### Include all scripts & rules necessary to run the workflow ###### ### Scripts -# get parameters from masterconfig include: "scripts/from_config/hifiasm_mode.py" include: "scripts/from_config/parameters.py" include: "scripts/from_config/target_list.py" +include: "scripts/path_helper.py" + 
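+# path_helper.py defines get_abs_root_path() and get_res_path(), which expand
+# a relative "root" from masterconfig.yaml (e.g. ".") into absolute paths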
+### paths +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] ### Rules -## PRE ASSEMBLY QC -include: "rules/01_qc.smk" -## ASSEMBLY +include: "rules/01_pre_asm_qc.smk" include: "rules/02_asm.smk" # Statistics include: "rules/03_asm_qc.smk" -include: "rules/03.5_A_qc_merqury.smk" +include: "rules/03.5_asm_qc_merqury.smk" # Purging include: "rules/04_purge_dups.smk" include: "rules/05_purged_asm_qc.smk" -include: "rules/05.5_PA_qc_merqury.smk" +include: "rules/05.5_pa_qc_merqury.smk" # Link final assembly include: "rules/06_sym_link_hap.smk" -## AUTOMATIC REPORT +# Automatic report include: "rules/07_report.smk" ###### get filenames for workflow ###### -IDS=config["IDS"] -bamIDS=check_bam(config["root"] + "/" + config["resdir"] + "/" + config["bamdir"], IDS) -fastqIDS=check_fastq(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"], IDS) -#### +if config["get_all_filenames"]: + IDS=get_files_id(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"]) +else: + IDS=config["IDS"] +bamIDS=check_bam(abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"], IDS) +fastqIDS=check_fastq(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"], IDS) RUNID = run_id(config["IDS"]) BID_RUN = run_BFid(bamIDS) FID_RUN = run_BFid(fastqIDS) -###### results path ###### -res_path=config["root"] + "/" + config["resdir"] - ###### Target files ###### -## raw data stats +### raw data stats longqc_output = expand(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC", zip, run=BID_RUN, Bid=bamIDS), fastqc_output = expand(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc.{ext}", zip, run=FID_RUN, Fid=fastqIDS, ext=["html", "zip"]) - - ### REPORT REP_ID = for_report(IDS) RUNID_REG = run_id(REP_ID) @@ -60,21 +61,17 @@ REP_TRIO_ID = for_report_trio(IDS) RUNID_TRIO = run_id(REP_TRIO_ID) BUSCO_LIN_TRIO = busco_lin(REP_TRIO_ID) - report_trio_output = expand(res_path + "/{runid}/report_trio_{id}.{lin}.html", zip, runid=RUNID_TRIO, id=REP_TRIO_ID, lin = BUSCO_LIN_TRIO) - - - ### SYM LINK -## symbolic link to final assembly +# symbolic link to final assembly symb_link1 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, runid=RUNID_REG, id=REP_ID, n=["1", "2"]) symb_link2 = expand(res_path + "/{runid}/{id}_hap{n}.fa", zip, runid=RUNID_TRIO, id=REP_TRIO_ID, n=["1", "2"]) -## PURGE_DUPS CUTOFFS GRAPH +# PURGE_DUPS CUTOFFS GRAPH cut_eval1 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, runid=RUNID_REG, id=REP_ID, n=["1", "2"]) cut_eval2 = expand(res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/cutoffs_graph_hap{n}.png", zip, @@ -106,7 +103,7 @@ rule_all_input_list = [ busco_purged_trio ] -#### target files +##### target files ##### rule all: input: all_input = rule_all_input_list \ No newline at end of file diff --git a/workflow/pre-job_snakefiles/Snakefile1.smk b/workflow/pre-job_snakefiles/Snakefile1.smk index 3314ac7c1b39fb9d80f6f04983765000543cbdb7..52629022404b57832b276cbe1a65c26e19d10d16 100644 --- a/workflow/pre-job_snakefiles/Snakefile1.smk +++ b/workflow/pre-job_snakefiles/Snakefile1.smk @@ -1,5 +1,7 @@ configfile: ".config/masterconfig.yaml" +include: "../scripts/path_helper.py" + ######################## Python functions ######################## import os, re # tar & tar.gz filename @@ -29,15 +31,21 
@@ def data_ext(dir, id): return(str(config["data"] + "/{id}.tar.gz")) ######################## Snakemake ######################## + +### paths +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] + ### get filenames for workflow if config["get_all_tar_filename"]: IDS=get_tar_name(config["data"]) else: IDS=config["tarIDS"] -###### results path ###### -res_path=config["root"] + "/" + config["resdir"] - ### target files rule all: input: @@ -60,9 +68,9 @@ rule extract_targz_data: # move bam and fasta + fastq files rule move_files: params: - root=config["root"], - bam_path=config["root"] + "/" + config["resdir"] + "/" + config["bamdir"], - fastx_path=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"], + root= abs_root_path, + bam_path= abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"], + fastx_path= abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"], shell: "cd {params.root} && " "mkdir -p {params.bam_path} {params.fastx_path} && " diff --git a/workflow/pre-job_snakefiles/Snakefile2.smk b/workflow/pre-job_snakefiles/Snakefile2.smk index b55585c14009c1cf3cc328dac9789a28fcc405b3..9e655adcedc5c700b159b7774cce48dba979bc1e 100644 --- a/workflow/pre-job_snakefiles/Snakefile2.smk +++ b/workflow/pre-job_snakefiles/Snakefile2.smk @@ -1,5 +1,7 @@ configfile: ".config/masterconfig.yaml" +include: "../scripts/path_helper.py" + ######################## Python functions ######################## import os # bam filename @@ -14,26 +16,32 @@ def get_bams_name(dirpath): return(IDS) ######################## Snakemake ######################## -###### results path ###### -res_path=config["root"] + "/" + config["resdir"] + +### root path +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] ### get filenames -IDS=get_bams_name(config["root"] + "/" + config["resdir"] + "/" + config["bamdir"]) +IDS=get_bams_name(abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"]) ### target files rule all: input: - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz", id=IDS), - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz", id=IDS), + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) ### rules ## PacBio .bam conversion with smrtlink # .bam.pbi needed for bam_to_ conversion rules rule smrtlink_index: input: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam" output: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam.pbi" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam.pbi" container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" shell: @@ -42,12 +50,12 @@ rule smrtlink_index: # convert .bam to .fastq.gz rule smrtlink_bam_to_fastq: input: - bam = config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", + bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", bam_pbi = rules.smrtlink_index.output output: - config["root"] + "/" 
+ config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" params: - prefix=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" + prefix= abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" priority: 2 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" @@ -57,12 +65,12 @@ rule smrtlink_bam_to_fastq: # convert .bam to .fasta.gz rule smrtlink_bam_to_fasta: input: - bam = config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", + bam = abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{id}.bam", bam_pbi = rules.smrtlink_index.output output: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" params: - prefix=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" + prefix= abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}" priority: 2 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/smrtlink9.0" diff --git a/workflow/pre-job_snakefiles/Snakefile3.smk b/workflow/pre-job_snakefiles/Snakefile3.smk index d0ab49218d2657c64f5967fe5f23f38f9d13f876..2df8324f8884e26495b4201c282770cedbbaaf97 100644 --- a/workflow/pre-job_snakefiles/Snakefile3.smk +++ b/workflow/pre-job_snakefiles/Snakefile3.smk @@ -1,5 +1,7 @@ configfile: ".config/masterconfig.yaml" +include: "../scripts/path_helper.py" + ######################## Python functions ######################## import os # fastq without fasta filename @@ -17,26 +19,32 @@ def get_fastq_name(dirpath): return(IDS) ######################## Snakemake ######################## -###### results path ###### -res_path=config["root"] + "/" + config["resdir"] + +### root path +if config["root"].startswith("."): + abs_root_path = get_abs_root_path() + res_path = get_res_path() +else: + abs_root_path = config["root"] + res_path = abs_root_path + "/" + config["resdir"] ### get filenames -IDS = get_fastq_name(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"]) +IDS = get_fastq_name(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"]) ### target files rule all: input: - expand(config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) + expand(abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz", id=IDS) ### rules # if only fastq : convert to fasta with seqtk + zip rule convert_to_fasta: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fastq.gz" output: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" params: - path=config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + path= abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] threads: 10 container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/seqtk1.3" diff --git a/workflow/rules/01_qc.smk b/workflow/rules/01_pre_asm_qc.smk similarity index 86% rename from workflow/rules/01_qc.smk rename to workflow/rules/01_pre_asm_qc.smk index 29bdc8c7d3346c6e7b5e1bb07572aa985c8fb045..e611b7c132a4ebb0d62fd9dd73ff6b46dc964f75 100644 --- a/workflow/rules/01_qc.smk +++ 
b/workflow/rules/01_pre_asm_qc.smk @@ -1,7 +1,7 @@ ### QC on .bam files with LongQC rule longqc: input: - config["root"] + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" + abs_root_path + "/" + config["resdir"] + "/" + config["bamdir"] + "/{Bid}.bam" output: directory(res_path + "/{Bid}/{run}/01_raw_data_QC/02_longQC") benchmark: @@ -18,7 +18,7 @@ rule longqc: ### QC on .fastq.gz files with FastQC rule fastqc: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{Fid}.fastq.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{Fid}.fastq.gz" output: multiext(res_path + "/{Fid}/{run}/01_raw_data_QC/01_fastQC/{Fid}_fastqc", ".html", ".zip") params: @@ -36,7 +36,7 @@ rule fastqc: rule genometools_on_raw_data: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: res_path + "/{runid}/01_raw_data_QC/03_genometools/{id}.RawStat.txt" priority: 1 @@ -52,7 +52,7 @@ rule genometools_on_raw_data: rule jellyfish: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: jf = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.jf", histo = res_path + "/{runid}/01_raw_data_QC/04_kmer/{id}.histo" diff --git a/workflow/rules/02_asm.smk b/workflow/rules/02_asm.smk index bcd9fc406a827c5aa3de782a419ee1464e8d567a..729feab3fe598a3873139568c4eb8cc757f47956 100644 --- a/workflow/rules/02_asm.smk +++ b/workflow/rules/02_asm.smk @@ -1,16 +1,15 @@ - ### haplotypes assembly # REGULAR MODE rule hifiasm: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.bp.hap2.p_ctg.gfa" params: - prefix = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -26,14 +25,14 @@ rule hifiasm_hic: r1 = get_r1, r2 = get_r2, # hifi reads - hifi = config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + hifi = abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + 
"/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.hic.hap2.p_ctg.gfa" params: - prefix= config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix= abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_hic_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -48,10 +47,10 @@ rule yak: p1 = get_p1, p2 = get_p2 output: - p1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent1.yak", - p2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" + p1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent1.yak", + p2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/yak/{id}_parent2.yak" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_yak_benchmark.txt" container: "docker://registry.forgemia.inra.fr/asm4pg/genomasm4pg/hifiasm0.16.1" shell: @@ -63,14 +62,14 @@ rule hifiasm_trio: input: p1 = rules.yak.output.p1, p2 = rules.yak.output.p2, - child = config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + child = abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: - hap1 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", - hap2 = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" + hap1 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap1.p_ctg.gfa", + hap2 = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}.dip.hap2.p_ctg.gfa" params: - prefix = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" + prefix = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}" benchmark: - config["root"] + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" + abs_root_path + "/" + config["resdir"] + "/{runid}/benchmark/{id}_hifiasm_trio_benchmark.txt" threads: 20 resources: mem_mb=250000 @@ -88,8 +87,8 @@ rule hap_gfa_to_fasta: hap1 = get_mode_hap1, hap2 = get_mode_hap2 output: - hap1_fa = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", - hap2_fa = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" + hap1_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap1.fa.gz", + hap2_fa = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap2.fa.gz" shell: """awk {TO_FA_CMD:q} {input.hap1} | pigz -p 1 > {output.hap1_fa} &&""" """awk {TO_FA_CMD:q} 
{input.hap2} | pigz -p 1 > {output.hap2_fa}""" \ No newline at end of file diff --git a/workflow/rules/03.5_A_qc_merqury.smk b/workflow/rules/03.5_asm_qc_merqury.smk similarity index 98% rename from workflow/rules/03.5_A_qc_merqury.smk rename to workflow/rules/03.5_asm_qc_merqury.smk index 973b30a6470039b238559e30921ad1351e2a1d73..33b44d33073f545cdd03157fa5d5524a8ab4a545 100644 --- a/workflow/rules/03.5_A_qc_merqury.smk +++ b/workflow/rules/03.5_asm_qc_merqury.smk @@ -2,7 +2,7 @@ ### create reads db necessary for merqury rule meryl: input: - config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: directory(res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC/merqury/{id}_reads-db_k21.meryl") benchmark: diff --git a/workflow/rules/03_asm_qc.smk b/workflow/rules/03_asm_qc.smk index c3ea246bfac35181459952f898d8fd8e4795f57f..f32a48053ba91a264d687778e061f1eeacdce878 100644 --- a/workflow/rules/03_asm_qc.smk +++ b/workflow/rules/03_asm_qc.smk @@ -1,5 +1,5 @@ # input haplotypes -HAP_FA_GZ = config["root"] + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" +HAP_FA_GZ = abs_root_path + "/" + config["resdir"] + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/{id}_hap{n}.fa.gz" # unzip fasta rule unzip_hap_fasta: diff --git a/workflow/rules/04_purge_dups.smk b/workflow/rules/04_purge_dups.smk index b5e512f41260e76955b74373e8de6630c99a7859..8da0fc3cf9e78d82f8ddf68018669857d6f95947 100644 --- a/workflow/rules/04_purge_dups.smk +++ b/workflow/rules/04_purge_dups.smk @@ -5,7 +5,7 @@ HAP_FA_GZ = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/00_assembly/ rule purge_dups_cutoffs: input: assembly = HAP_FA_GZ, - reads = config["root"] + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" + reads = abs_root_path + "/" + config["resdir"] + "/" + config["fastxdir"] + "/{id}.fasta.gz" output: paf = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/{id}_hap{n}.paf.gz", calcuts = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/00_assembly/{id}_hap{n}/calcuts.log", diff --git a/workflow/rules/05.5_PA_qc_merqury.smk b/workflow/rules/05.5_pa_qc_merqury.smk similarity index 100% rename from workflow/rules/05.5_PA_qc_merqury.smk rename to workflow/rules/05.5_pa_qc_merqury.smk diff --git a/workflow/rules/07_report.smk b/workflow/rules/07_report.smk index a2321652acc1f979f5f9d9ca5c13b564839457d5..20048ebd8acbe9dd7564b337c872bb7126ffd940 100644 --- a/workflow/rules/07_report.smk +++ b/workflow/rules/07_report.smk @@ -1,6 +1,5 @@ ### create report at the end of the workflow # path variables -res_path=config["root"] + "/" + config["resdir"] RAW_QC = res_path + "/{runid}/01_raw_data_QC" ASM_QC = res_path + "/{runid}/02_genome_assembly/01_raw_assembly/01_assembly_QC" P_ASM_QC = res_path + "/{runid}/02_genome_assembly/02_after_purge_dups_assembly/01_assembly_QC" diff --git a/workflow/scripts/from_config/target_list.py b/workflow/scripts/from_config/target_list.py index 2281c0e251f1cb430cbcb8f18a49233778596a15..75731f4652637f0c4024acdf186bc9f5839a47b8 100644 --- a/workflow/scripts/from_config/target_list.py +++ b/workflow/scripts/from_config/target_list.py @@ -69,4 +69,17 @@ def check_fastq(dirpath, IDlist): if splitResult[-2] == "fastq": filename= ".".join(splitResult[:-2]) IDS.append(filename) + return(IDS) + + +### FASTA +def 
get_files_id(dirpath): + IDS = [] + for file in os.listdir(dirpath): + splitResult = file.split(".") + ext = splitResult[-1] + if ext == "gz": + if splitResult[-2] == "fasta": + filename= ".".join(splitResult[:-2]) + IDS.append(filename) return(IDS) \ No newline at end of file diff --git a/workflow/scripts/path_helper.py b/workflow/scripts/path_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..c51b7bfd979532d57a51493bff623cb10050b664 --- /dev/null +++ b/workflow/scripts/path_helper.py @@ -0,0 +1,13 @@ +import os + +###### root path ###### +def get_abs_root_path(): + abs_root_path = os.path.abspath(config["root"]) + return(abs_root_path) + + +###### results path ###### +def get_res_path(): + abs_root_path = os.path.abspath(config["root"]) + res_path= abs_root_path + "/" + config["resdir"] + return(res_path) \ No newline at end of file
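
For reference, a quick sketch of what the new `workflow/scripts/path_helper.py` helpers compute when `masterconfig.yaml` sets `root: .` (the `resdir` value below is an assumption for the example; inside the workflow, `config` comes from the `configfile` directive):

```python
# sketch: path resolution performed by get_abs_root_path()/get_res_path()
import os

config = {"root": ".", "resdir": "workflow_results"}  # stand-in for Snakemake's config

abs_root_path = os.path.abspath(config["root"])    # e.g. /path/to/GenomAsm4pg
res_path = abs_root_path + "/" + config["resdir"]  # e.g. /path/to/GenomAsm4pg/workflow_results
print(res_path)
```

With an absolute `root`, the Snakefiles take the `else` branch instead and use the configured value unchanged, so rules receive absolute paths either way.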