forked from ga4gh/ga4gh-schemas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcommon.avdl
135 lines (122 loc) · 4.29 KB
/
common.avdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
@namespace("org.ga4gh.models")
/**
Shared type library for the GA4GH protocols. `Common` declares no methods of
its own; it only defines types that the other GA4GH protocols use.
*/
protocol Common {

  /**
  The DNA strand that a data item is associated with.
  * `NEG_STRAND`: The negative (-) strand.
  * `POS_STRAND`: The positive (+) strand.
  */
  enum Strand {
    NEG_STRAND,
    POS_STRAND
  }

  // NOTE(review): the description below calls a `Position` "unoriented", yet
  // the record carries a `strand` field -- confirm the intended wording.
  /**
  A `Position` is an unoriented base in some `Reference`. It is expressed as
  the name of that `Reference` together with a 0-based base number on it.
  */
  record Position {
    /**
    Name of the `Reference` that this `Position` lies on.
    */
    string referenceName;

    /**
    0-based offset, counted from the start of the forward strand of that
    `Reference`. Genomic positions are non-negative integers smaller than the
    length of the `Reference`.
    */
    long position;

    /**
    The strand this position is associated with.
    */
    Strand strand;
  }

  /**
  An identifier taken from a public database.
  */
  record ExternalIdentifier {
    /**
    Where the identifier comes from (e.g. `Ensembl`).
    */
    string database;

    /**
    The ID as defined by the external database (e.g. `ENST00000000000`).
    */
    string identifier;

    /**
    Version of the object or of the database (e.g. `78`).
    */
    string version;
  }

  /**
  The kinds of CIGAR alignment operation. This enum is used wherever CIGAR
  alignments are used. The values are used as follows:
  * `ALIGNMENT_MATCH`: The sequence can be aligned to the reference without
    evidence of an INDEL. Unlike `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH`,
    `ALIGNMENT_MATCH` does not say whether the reference and read sequences
    are an exact match. Equivalent to SAM's `M`.
  * `INSERT`: The read contains evidence of bases being inserted into the
    reference. Equivalent to SAM's `I`.
  * `DELETE`: The read contains evidence of bases being deleted from the
    reference. Equivalent to SAM's `D`.
  * `SKIP`: The read skips a long segment of the reference, but the bases
    have not been deleted. Commonly used with RNA-seq data, where reads may
    skip long segments of the reference between exons. Equivalent to SAM's
    `N`.
  * `CLIP_SOFT`: Bases at the start/end of a read have not been considered
    during alignment. This may occur if the majority of a read maps, except
    for low quality bases at the start/end of the read. Soft clipped bases
    are still stored in the read. Equivalent to SAM's `S`.
  * `CLIP_HARD`: Bases at the start/end of a read have been omitted from this
    alignment. This may occur if this linear alignment is part of a chimeric
    alignment, or if the read has been trimmed (e.g., during error
    correction, or to trim poly-A tails for RNA-seq). Equivalent to SAM's
    `H`.
  * `PAD`: There is padding in the alignment. Equivalent to SAM's `P`.
  * `SEQUENCE_MATCH`: This portion of the aligned sequence exactly matches
    the reference (i.e., all bases are equal to the reference bases).
    Equivalent to SAM's `=`.
  * `SEQUENCE_MISMATCH`: This portion of the aligned sequence is an alignment
    match to the reference, but a sequence mismatch (i.e., the bases are not
    equal to the reference). This can indicate a SNP or a read error.
    Equivalent to SAM's `X`.
  */
  enum CigarOperation {
    ALIGNMENT_MATCH,
    INSERT,
    DELETE,
    SKIP,
    CLIP_SOFT,
    CLIP_HARD,
    PAD,
    SEQUENCE_MATCH,
    SEQUENCE_MISMATCH
  }

  /**
  One instance of a CIGAR operation.
  */
  record CigarUnit {
    /**
    Which operation this unit represents.
    */
    CigarOperation operation;

    /**
    How many bases the operation runs for.
    */
    long operationLength;

    /**
    Only used at mismatches (`SEQUENCE_MISMATCH`) and deletions (`DELETE`).
    Filling this field replaces the MD tag. Leave it as `null` when the
    relevant information is not available.
    */
    union { null, string } referenceSequence = null;
  }
}