Skip to content

Commit 579a239

Browse files
authored
Merge pull request #9 from Quafadas/multi_select_columns
Multi select columns
2 parents 96e4960 + 054c525 commit 579a239

File tree

4 files changed

+256
-5
lines changed

4 files changed

+256
-5
lines changed

.scalafmt.conf

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@ runner.dialectOverride.allowSignificantIndentation = true
1010
rewrite.scala3.countEndMarkerLines = lastBlockOnly
1111
rewrite.scala3.insertEndMarkerMinLines = 1
1212
indent.main = 2
13-
maxColumn = 120
13+
maxColumn = 180
1414
project.excludeFilters = [ ".*/build\\.mill"]

build.sc → build.mill

File renamed without changes.

scautable/src/csv.scala

+197-4
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,33 @@ import fansi.Str
1515
import scala.collection.View.FlatMap
1616
import io.github.quafadas.scautable.ConsoleFormat.*
1717

18+
import scala.math.Fractional.Implicits.*
1819

1920

2021
@experimental
2122
object CSV:
23+
24+
/** Materialises, at compile time, the value described by the constant type `A`.
  *
  * A non-empty tuple type is resolved element by element (each element is itself resolved through
  * `constValueAll`, so nested tuples are handled), `EmptyTuple` resolves to the empty tuple, and every
  * other type is handed to `constValue`.
  *
  * @tparam A
  *   the type to materialise
  * @return
  *   the value of type `A`
  */
inline def constValueAll[A]: A =
  inline erasedValue[A] match
    case _: (first *: rest) =>
      val resolved = constValueAll[first] *: constValueAll[rest]
      resolved.asInstanceOf[A]
    case _: EmptyTuple =>
      EmptyTuple.asInstanceOf[A]
    case _ =>
      constValue[A]
29+
30+
31+
/** Converts a list into a tuple holding the same elements in the same order.
  *
  * Built with `foldRight` rather than explicit recursion, so long lists do not grow the call stack.
  *
  * @param list
  *   the elements to pack
  * @return
  *   `EmptyTuple` for an empty list, otherwise a tuple with one slot per list element
  */
private def listToTuple[A](list: List[A]): Tuple =
  list.foldRight[Tuple](EmptyTuple)((elem, acc) => elem *: acc)
34+
2235
type Concat[X <: String, Y <: Tuple] = X *: Y
2336

2437
type ConcatSingle[X, A] = X *: A *: EmptyTuple
2538

39+
/** Flips every boolean literal type in the tuple `T`: `true` becomes `false` and vice versa.
  *
  * The result has the same length as `T`.
  */
type Negate[T <: Tuple] <: Tuple = T match
  case EmptyTuple => EmptyTuple
  case (flag *: rest) =>
    flag match
      case true  => false *: Negate[rest]
      case false => true *: Negate[rest]
44+
2645
type IsColumn[StrConst <: String, T <: Tuple] = T match
2746
case EmptyTuple => false
2847
case (head *: tail) => IsMatch[StrConst, head] match
@@ -62,6 +81,10 @@ object CSV:
6281
case false =>
6382
typeHead *: DropOneTypeAtName[nameTail, StrConst, typeTail]
6483

84+
/** Looks up, for each name in `ForNames`, the result of `GetTypeAtName` against the name tuple `N` and
  * the type tuple `T`, collecting the results in the order of `ForNames`.
  */
type GetTypesAtNames[N <: Tuple, ForNames <: Tuple, T <: Tuple] <: Tuple = ForNames match
  case EmptyTuple => EmptyTuple
  case (wanted *: remaining) =>
    GetTypeAtName[N, wanted, T] *: GetTypesAtNames[N, remaining, T]
87+
6588
type GetTypeAtName[N <: Tuple, StrConst <: String, T <: Tuple] = (N, T) match
6689
case (EmptyTuple, _) => EmptyTuple
6790
case (_, EmptyTuple) => EmptyTuple
@@ -87,6 +110,42 @@ object CSV:
87110
case A => true
88111
case _ => false
89112

113+
/** Type-level predicate: `true` when `T` is `Int`, `Long`, `Float` or `Double`, or an `Option` wrapping a
  * type that is itself numeric by this definition; `false` for everything else.
  */
type IsNumeric[T] <: Boolean = T match
  case Option[inner] => IsNumeric[inner]
  case Double        => true
  case Float         => true
  case Long          => true
  case Int           => true
  case _             => false
120+
121+
/** Builds a boolean mask with one entry per element of `T`: `true` where `IsNumeric` holds for the
  * element type, `false` otherwise.
  */
type NumericColsIdx[T <: Tuple] <: Tuple = T match
  case EmptyTuple => EmptyTuple
  case (elem *: rest) =>
    IsNumeric[elem] match
      case false => false *: NumericColsIdx[rest]
      case true  => true *: NumericColsIdx[rest]
127+
128+
/** Keeps the elements of `T` whose corresponding entry in the boolean mask `Bools` is `true`, preserving
  * their order.
  *
  * Only the shapes `true *: _` and `false *: _` are handled for `Bools` while `T` is non-empty.
  */
type SelectFromTuple[T <: Tuple, Bools <: Tuple] <: Tuple = T match
  case EmptyTuple => EmptyTuple
  case (elem *: rest) =>
    Bools match
      case (false *: flags) => SelectFromTuple[rest, flags]
      case (true *: flags)  => elem *: SelectFromTuple[rest, flags]
133+
134+
/** Type-level predicate: `true` when `IsColumn` holds against `K` for every element of `T` (trivially
  * `true` for the empty tuple), `false` as soon as one element fails the check.
  */
type AllAreColumns[T <: Tuple, K <: Tuple] <: Boolean = T match
  case EmptyTuple => true
  case (candidate *: others) =>
    IsColumn[candidate, K] match
      case false => false
      case true  => AllAreColumns[others, K]
139+
140+
/** Builds a boolean mask over `In`: entry `i` is `true` when the `i`-th element of `In` is found in
  * `Search` (as decided by `IsColumn`), `false` otherwise.
  *
  * The mask always has the same length as `In`. An empty `Search` needs no special case because
  * `IsColumn` against `EmptyTuple` is `false`, which yields an all-`false` mask. The previous dedicated
  * branch returned a one-element mask regardless of `In`'s length, leaving `SelectFromTuple` unable to
  * reduce.
  */
type TupleContainsIdx[Search <: Tuple, In <: Tuple] <: Tuple = In match
  case EmptyTuple => EmptyTuple
  case head *: tail =>
    IsColumn[head, Search] match
      case true  => true *: TupleContainsIdx[Search, tail]
      case false => false *: TupleContainsIdx[Search, tail]
147+
148+
90149

91150
type StringifyTuple[T >: Tuple] <: Tuple = T match
92151
case EmptyTuple => EmptyTuple
@@ -168,6 +227,135 @@ object CSV:
168227
}
169228
}
170229

230+
// inline def numericCols: Iterator[
231+
// NamedTuple.NamedTuple[
232+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
233+
// io.github.quafadas.scautable.CSV.TupleContainsIdx[
234+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
235+
// io.github.quafadas.scautable.CSV.NumericColsIdx[V1]],
236+
// K1]
237+
// ],
238+
// io.github.quafadas.scautable.CSV.SelectFromTuple[V1,
239+
// io.github.quafadas.scautable.CSV.TupleContainsIdx[
240+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
241+
// io.github.quafadas.scautable.CSV.NumericColsIdx[V1]],
242+
// K1]
243+
// ]
244+
// ]
245+
// ] =
246+
// val ev1 = summonInline[AllAreColumns[SelectFromTuple[K1, NumericColsIdx[V1]], K1] =:= true]
247+
// columns[SelectFromTuple[K1, NumericColsIdx[V1]]](using ev1)
248+
249+
// inline def nonNumericCols: Iterator[
250+
// NamedTuple.NamedTuple[
251+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
252+
// io.github.quafadas.scautable.CSV.TupleContainsIdx[
253+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
254+
// io.github.quafadas.scautable.CSV.Negate[
255+
// io.github.quafadas.scautable.CSV.NumericColsIdx[V1]]
256+
// ],
257+
// K1]
258+
// ],
259+
// io.github.quafadas.scautable.CSV.SelectFromTuple[V1,
260+
// io.github.quafadas.scautable.CSV.TupleContainsIdx[
261+
// io.github.quafadas.scautable.CSV.SelectFromTuple[K1,
262+
// io.github.quafadas.scautable.CSV.Negate[
263+
// io.github.quafadas.scautable.CSV.NumericColsIdx[V1]]
264+
// ],
265+
// K1]
266+
// ]
267+
// ]
268+
// ] =
269+
// val ev1 = summonInline[
270+
// AllAreColumns[SelectFromTuple[K1, Negate[NumericColsIdx[V1]]], K1] =:= true
271+
// ]
272+
// columns[SelectFromTuple[K1, Negate[NumericColsIdx[V1]]]](using ev1)
273+
274+
/** Returns the names of the columns selected by `ST`, in the order they occur in `K1`.
  *
  * The names are materialised from the result type with `constValueTuple`, so this works for any
  * iterator whose column names are literal string types. The earlier version returned a fixed set of
  * Titanic column names cast to the result type.
  *
  * @tparam ST
  *   tuple of column names to select
  */
inline def resolve[ST <: Tuple]: SelectFromTuple[K1, TupleContainsIdx[ST, K1]] =
  constValueTuple[SelectFromTuple[K1, TupleContainsIdx[ST, K1]]]
275+
/** Type-resolution probe: exposes the value types computed for the selection `ST` as the static result
  * type.
  *
  * NOTE(review): the returned value is a fixed placeholder `(1, Some(2.0), 1, 1, 2.0)` cast to the
  * computed type, so it is only meaningful when the selection has exactly that shape.
  */
inline def resolveT[ST <: Tuple]: GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1] =
  val placeholder = (1, Some(2.0), 1, 1, 2.0)
  placeholder.asInstanceOf[GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]]
276+
277+
/** Type-resolution probe: exposes the named-tuple type computed for the selection `ST` as the static
  * result type.
  *
  * NOTE(review): the returned value is a fixed placeholder row labelled with the names
  * `("Pclass", "Age", "SibSp", "Parch", "Fare")` and cast to the computed type, so it is only
  * meaningful when the selection has exactly that shape.
  */
inline def resolveNT[ST <: Tuple]: NamedTuple[
  SelectFromTuple[K1, TupleContainsIdx[ST, K1]],
  GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]
] =
  val placeholder = (1, Some(2.0), 1, 1, 2.0).withNames[("Pclass", "Age", "SibSp", "Parch", "Fare")]
  placeholder.asInstanceOf[
    NamedTuple[
      SelectFromTuple[K1, TupleContainsIdx[ST, K1]],
      GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]
    ]
  ]
289+
290+
291+
292+
/** Projects every row of the iterator onto the columns named in `ST`.
  *
  * The selected columns are emitted in the order in which they occur in `K1`, not in the order they are
  * listed in `ST`. Rows are transformed through `itr.map`.
  *
  * @tparam ST
  *   tuple of column names to keep; `ev` requires each of them to be a column of `K1`
  * @return
  *   an iterator of named tuples restricted to the selected columns
  */
inline def columns[ST <: Tuple](using ev: AllAreColumns[ST, K1] =:= true): Iterator[
  NamedTuple[
    SelectFromTuple[K1, TupleContainsIdx[ST, K1]],
    GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]
  ]
] =
  val allNames = constValueTuple[K1].toList.map(_.toString())
  val keptNames = constValueTuple[SelectFromTuple[K1, TupleContainsIdx[ST, K1]]].toList.map(_.toString())

  // Positions of the kept columns within the full row, in the row's own column order.
  val positions = keptNames.map(allNames.indexOf(_)).filterNot(_ == -1)

  itr.map[NamedTuple[SelectFromTuple[K1, TupleContainsIdx[ST, K1]], GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]]] {
    (row: NamedTuple[K1, V1]) =>
      val values = row.toTuple

      // Fold from the right so the picked values end up in the same order as `positions`.
      val picked: Tuple = positions.foldRight(EmptyTuple: Tuple)((position, acc) => values(position) *: acc)

      picked
        .withNames[SelectFromTuple[K1, TupleContainsIdx[ST, K1]]]
        .asInstanceOf[
          NamedTuple[
            SelectFromTuple[K1, TupleContainsIdx[ST, K1]],
            GetTypesAtNames[K1, SelectFromTuple[K1, TupleContainsIdx[ST, K1]], V1]
          ]
        ]
  }
333+
334+
/** Computes summary statistics for the column named `S`.
  *
  * The whole column is read via `itr.column[S]` and held in memory. The result is a named tuple with the
  * fields `mean`, `std`, `min`, `25%`, `50%`, `75%` and `max`:
  *   - `std` is the square root of the population variance (sum of squared deviations divided by the
  *     number of values, not by `n - 1`);
  *   - percentiles use linear interpolation between the two neighbouring sorted values.
  *
  * @tparam S
  *   literal name of the column; must be a column of `K1` with a `Fractional` instance available
  * @throws NoSuchElementException
  *   if the column contains no values
  */
inline def numericColSummary[S <: String](using
    ev: IsColumn[S, K1] =:= true,
    isNum: IsNumeric[GetTypeAtName[K1, S, V1]] =:= true,
    s: ValueOf[S],
    a: Fractional[GetTypeAtName[K1, S, V1]]
) =
  // A Vector gives cheap indexed access for the percentile lookups below.
  val numericValues = itr.column[S].toVector.asInstanceOf[Vector[GetTypeAtName[K1, S, V1]]]

  // Fail with a descriptive message instead of an opaque "head of empty list".
  if numericValues.isEmpty then throw new NoSuchElementException(s"numericColSummary: column '${s.value}' contains no values")

  val sortedValues = numericValues.sorted
  val size = sortedValues.size

  def percentile(p: Double): Double =
    val rank = p * (size - 1)
    val lower = sortedValues(rank.toInt)
    val upper = sortedValues(math.ceil(rank).toInt)
    // Interpolate linearly by the fractional part of the rank.
    lower.toDouble + a.minus(upper, lower).toDouble * (rank - rank.toInt)

  val mean = numericValues.sum / a.fromInt(size)
  val min = sortedValues.head
  val max = sortedValues.last
  val variance = numericValues.map(x => a.minus(x, mean)).map(x => a.times(x, x)).sum / a.fromInt(size)

  val percentiles = List(0.25, 0.5, 0.75).map(percentile)

  val std = math.sqrt(variance.toDouble)

  (mean, std, min, percentiles(0), percentiles(1), percentiles(2), max).withNames[("mean", "std", "min", "25%", "50%", "75%", "max")]
357+
358+
171359
inline def column[S <: String](using ev: IsColumn[S, K1] =:= true, s: ValueOf[S]): Iterator[GetTypeAtName[K1, S, V1]] = {
172360
val headers = constValueTuple[K1].toList.map(_.toString())
173361
/**
@@ -205,6 +393,15 @@ object CSV:
205393
/** Sequence counterpart of the iterator `addColumn`: runs the iterator version over `nt` and collects
  * the rows into a `Seq`.
  */
inline def addColumn[S <: String, A](fct: (tup: NamedTuple.NamedTuple[K, V]) => A): Seq[NamedTuple[S *: K, A *: V]] =
  val extended = nt.toIterator.addColumn[S, A](fct)
  extended.toSeq
207395

396+
/** Sequence counterpart of the iterator `columns`: runs the iterator version over `nt` with the same
  * evidence and collects the rows into a `Seq`.
  *
  * @tparam ST
  *   tuple of column names to keep; `ev` requires each of them to be a column of `K`
  */
inline def columns[ST <: Tuple](using ev: AllAreColumns[ST, K] =:= true): Seq[
  NamedTuple[
    SelectFromTuple[K, TupleContainsIdx[ST, K]],
    GetTypesAtNames[K, SelectFromTuple[K, TupleContainsIdx[ST, K]], V]
  ]
] =
  val projected = nt.toIterator.columns[ST](using ev)
  projected.toSeq
404+
208405
/** Sequence counterpart of the iterator `dropColumn`: runs the iterator version over `nt` and collects
  * the rows into a `Seq`.
  */
inline def dropColumn[S <: String](using ev: IsColumn[S, K] =:= true, s: ValueOf[S]): Seq[NamedTuple[DropOneName[K, S], DropOneTypeAtName[K, S, V]]] =
  val remaining = nt.toIterator.dropColumn[S]
  remaining.toSeq
210407

@@ -251,10 +448,6 @@ object CSV:
251448
hasMore
252449
end hasNext
253450

254-
private def listToTuple[A](list: List[A]): Tuple = list match
255-
case Nil => EmptyTuple
256-
case h :: t => h *: listToTuple(t)
257-
258451
def numericTypeTest(sample: Option[Int] = None) =
259452
val sampled = sample match
260453
case Some(n) =>

scautable/test/jvm/src/testJvm.scala

+58
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,64 @@ class CSVSuite extends munit.FunSuite:
7979
)
8080
}
8181

82+
test("columns") {
  def csv: CsvIterator[("col1", "col2", "col3")] = CSV.absolutePath(Generated.resourceDir0 + "simple.csv")

  // A one-element selection has to be written as `Tuple1[...]`: `("notcol")` is only a parenthesised
  // literal type, so it would be rejected by the `ST <: Tuple` bound instead of by the column check.
  assert(
    !compileErrors("csv.columns[Tuple1[\"notcol\"]]").isEmpty()
  )

  def cols = csv.mapColumn["col1", Int](_.toInt)

  def selectCols: Iterator[(col1: Int, col3: String)] = cols.columns[("col1", "col3")]

  // Read the projected rows once and assert on them (not on the unprojected `cols`).
  val selected = selectCols.toList

  assertEquals(selected.head.col1, 1)
  assertEquals(selected.head.col3, "7")
  assertEquals(selected.last.col1, 5)
  assertEquals(selected.last.col3, "9")

  // def numerics: Iterator[(col1 : Int)] = cols.numericCols

  // def numerics2: Iterator[(col1 : Int, col2 : Double)] = cols.mapColumn["col2", Double](_.toDouble).numericCols
  // def numerics3: Iterator[(col1 : Float, col2 : Double)] = numerics2.mapColumn["col1", Float](_.toFloat).numericCols
  // def numerics4: Iterator[(col1 : Option[Int], col2 : Option[Double])] = csv
  //   .mapColumn["col2", Option[Double]](_.toDoubleOption)
  //   .mapColumn["col1", Option[Int]](_.toIntOption)
  //   .numericCols
  // def nonNumeric: Iterator[(col2 : String, col3 : String)] = cols.nonNumericCols
}
108+
109+
test("titanic cols") {
  enum Gender:
    case Male, Female

  def titanic = CSV.absolutePath(Generated.resourceDir0 + "titanic.csv")
  def data = titanic
    .mapColumn["Sex", Gender]((x: String) => Gender.valueOf(x.capitalize))
    .dropColumn["PassengerId"]
    .mapColumn["Age", Option[Double]](_.toDoubleOption)
    .mapColumn["Survived", Boolean](_ == "1")
    .mapColumn["Pclass", Int](_.toInt)
    .mapColumn["SibSp", Int](_.toInt)
    .mapColumn["Parch", Int](_.toInt)
    .mapColumn["Fare", Double](_.toDouble)

  // The type ascriptions below are the compile-time part of the test: the selection is requested in a
  // different order than the file's columns, and the resolved types must follow the file's order.
  val k: ("Pclass", "Age", "SibSp", "Parch", "Fare") = data.resolve[("Fare", "Pclass", "Age", "SibSp", "Parch")]
  val kT: (Int, Option[Double], Int, Int, Double) = data.resolveT[("Fare", "Pclass", "Age", "SibSp", "Parch")]
  val kNT: (Pclass: Int, Age: Option[Double], SibSp: Int, Parch: Int, Fare: Double) = data.resolveNT[("Fare", "Pclass", "Age", "SibSp", "Parch")]
  println(Array(kNT).consoleFormatNt)

  val numericols: List[(Pclass: Int, Age: Option[Double], SibSp: Int, Parch: Int, Fare: Double)] = data.columns[("Fare", "Pclass", "Age", "SibSp", "Parch")].take(2).toList

  // val numericols: Array[(Pclass : Int, Age : Option[Double], SibSp : Int, Parch : Int, Fare : Double)] = data.columns[("Fare", "Pclass", "Age", "SibSp", "Parch")].take(2).toList.toArray

  // NOTE(review): assumes titanic.csv holds at least two data rows.
  assertEquals(numericols.length, 2)

  // This will fail if the `columns` method doesn't return the columns in the right order.
  println(numericols.consoleFormatNt)
}
82140

83141
test("column") {
84142
def csv: CsvIterator[("col1", "col2", "col3")] = CSV.absolutePath(Generated.resourceDir0 + "simple.csv")

0 commit comments

Comments
 (0)