// variants.avdl
// Forked from ga4gh/ga4gh-schemas.
@namespace("org.ga4gh.models")
/**
Defines types used by the GA4GH Variants API.
*/
protocol Variants {
import idl "common.avdl";
/**
One piece of VCF header information, represented as metadata.
*/
record VariantSetMetadata {
  /** Top-level key of this metadata entry. */
  string key;

  /** Value of the entry; this field is used for simple metadata. */
  string value;

  /**
  ID supplied by the user; this API does not enforce it.
  Pieces of structured metadata whose id and key fields are both identical
  are considered equivalent to one another.
  */
  string id;

  /** Type of the data. */
  string type;

  /**
  How many values may be included in a field that this metadata describes.
  */
  string number;

  /** Free-text description of this metadata. */
  string description;

  /** Any remaining structured metadata, as key-value pairs. */
  map<array<string>> info = {};
}
/**
A `VariantSet` is what both `Variant` and `CallSet` belong to; the
`VariantSet` in turn belongs to a `Dataset`.
A variant set is the equivalent of a VCF file.
*/
record VariantSet {
  /** ID of this variant set. */
  string id;

  /** Name of this variant set. Defaults to null. */
  union { null, string } name = null;

  /** ID of the dataset that this variant set belongs to. */
  string datasetId;

  /**
  ID of the reference set used by the variants in this variant set.
  */
  string referenceSetId;

  /**
  Metadata associated with this variant set. It is equivalent to the VCF
  header information that is not already exposed through first class fields.
  */
  array<VariantSetMetadata> metadata = [];
}
/**
A `CallSet` is a collection of variant calls for a particular sample.
It belongs to a `VariantSet`. This is equivalent to one column in VCF.
*/
record CallSet {
  /** The call set ID. */
  string id;

  /** The call set name. Defaults to null. */
  union { null, string } name = null;

  /**
  The sample this call set's data was generated from.
  Defaults to null, like every other nullable field in this protocol, so
  that writers may omit it.
  */
  union { null, string } sampleId = null;

  /** The IDs of the variant sets this call set has calls in. */
  array<string> variantSetIds = [];

  /** The date this call set was created in milliseconds from the epoch. */
  union { null, long } created = null;

  /**
  The time at which this call set was last updated in
  milliseconds from the epoch.
  */
  union { null, long } updated = null;

  /**
  A map of additional call set information.
  */
  map<array<string>> info = {};
}
/**
A `Call` is the determination of a genotype with respect to one particular
`Variant`.
Associated information such as quality and phasing may be included. As an
example, a call might assign a probability of 0.32 to the occurrence of a SNP
named rs1234 in a call set with the name NA12345.
*/
record Call {
  /**
  Name of the call set that this variant call belongs to.
  When this field is not present, the call sets returned by a
  `SearchCallSetsRequest` over this `VariantSet` are guaranteed to be ordered
  the same way as the calls on this `Variant`, and the number of results
  will also be the same.
  */
  union { null, string } callSetName = null;

  /**
  ID of the call set that this variant call belongs to.
  When this field is not present, the call sets returned by a
  `SearchCallSetsRequest` over this `VariantSet` are guaranteed to be ordered
  the same way as the calls on this `Variant`, and the number of results
  will also be the same.
  */
  union { null, string } callSetId = null;

  /**
  Genotype of this variant call.
  The value 0 stands for the reference allele of the associated `Variant`;
  every other value is a 1-based index into the alternate alleles of the
  associated `Variant`.
  For a variant with a referenceBases field of "T" and an alternateBases
  value of ["A", "C"], a genotype of [2, 1] means the call represents the
  heterozygous value "CA" for this variant, whereas a genotype of [0, 1]
  represents the value "TA". The order of the genotype values is important
  if the phaseset field is present.
  */
  array<int> genotype = [];

  /**
  When this field is not null, the ordering of this variant call's genotype
  implies the phase of the bases, and it is consistent with any other
  variant calls on the same contig that have the same phaseset string.
  */
  union { null, string } phaseset = null;

  /**
  Genotype likelihoods for this variant call. Every array entry expresses
  how likely one specific genotype is for this call, as
  log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The
  ordering of the values is the one defined by the GL tag in the VCF spec.
  */
  array<double> genotypeLikelihood = [];

  /**
  Map holding additional variant call information.
  */
  map<array<string>> info = {};
}
/**
A `Variant` describes a change in DNA sequence relative to some reference;
it could, for example, represent a SNP or an insertion.
Every variant belongs to a `VariantSet`.
A variant is equivalent to a row in VCF.
*/
record Variant {
  /** ID of this variant. */
  string id;

  /**
  ID of the `VariantSet` that this variant belongs to. Transitively, this
  defines the `ReferenceSet` against which the `Variant` is to be interpreted.
  */
  string variantSetId;

  /** Names for this variant, for example a RefSNP ID. */
  array<string> names = [];

  /** Date this variant was created, in milliseconds from the epoch. */
  union { null, long } created = null;

  /**
  Time at which this variant was last updated, in milliseconds from the
  epoch.
  */
  union { null, long } updated = null;

  /**
  Reference on which this variant occurs.
  (e.g. `chr20` or `X`)
  */
  string referenceName;

  /**
  Start position at which this variant occurs (0-based).
  It corresponds to the first base of the string of reference bases.
  Genomic positions are non-negative integers less than reference length.
  A variant spanning the join of a circular genome is represented as
  two variants, one on each side of the join (position 0).
  */
  long start;

  /**
  End position (exclusive), which gives a [start, end) closed-open interval.
  Typically this is calculated by `start + referenceBases.length`.
  */
  long end;

  /**
  Reference bases for this variant, starting at the given start position.
  */
  string referenceBases;

  /**
  Bases that appear instead of the reference bases. Multiple alternate
  alleles are possible.
  */
  array<string> alternateBases = [];

  /**
  Map holding additional variant information.
  */
  map<array<string>> info = {};

  /**
  Variant calls for this particular variant. Each of them represents the
  determination of genotype with respect to this variant. The `Call`s in this
  array are implicitly associated with this `Variant`.
  */
  array<Call> calls = [];
}
}