diff --git a/Cats.html b/Cats.html index 4045e6f3..17a75642 100644 --- a/Cats.html +++ b/Cats.html @@ -166,7 +166,7 @@

count <- typedDs.count[Action]() } yield (sample, count) // result: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$Lambda$11608/0x000000080332d840@2776a61 +// cats.data.Kleisli$$Lambda$11657/0x0000000803456840@3953b3f4 // )

As with Job, note that nothing has been run yet. The effect has been properly suspended. To run our program, we must first supply the SparkSession to the ReaderT layer and then @@ -193,7 +193,7 @@

yield r // resultWithDescription: Action[(Seq[(Int, String)], Long)] = Kleisli( -// cats.data.Kleisli$$$Lambda$13104/0x00000008038cd040@2dc9e098 +// cats.data.Kleisli$$$Lambda$13144/0x00000008038d5040@230a0c12 // ) resultWithDescription.run(spark).unsafeRunSync() diff --git a/FeatureOverview.html b/FeatureOverview.html index 13ccf703..87940b47 100644 --- a/FeatureOverview.html +++ b/FeatureOverview.html @@ -696,7 +696,7 @@

// priceModifier: (String, Double) => Double = <function2> val udf = aptTypedDs.makeUDF(priceModifier) -// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$14199/0x0000000803d39840@15ce8e4e +// udf: (frameless.TypedColumn[Apartment, String], frameless.TypedColumn[Apartment, Double]) => frameless.TypedColumn[Apartment, Double] = frameless.functions.Udf$$Lambda$14198/0x0000000803cd0840@5a22008b val aptds = aptTypedDs // For shorter expressions // aptds: TypedDataset[Apartment] = [city: string, surface: int ... 2 more fields] diff --git a/Injection.html b/Injection.html index 3ad63424..60c77bab 100644 --- a/Injection.html +++ b/Injection.html @@ -144,7 +144,7 @@

// people: Seq[Person] = List( // Person( // 42, -// java.util.GregorianCalendar[time=1725583260913,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=8,WEEK_OF_YEAR=36,WEEK_OF_MONTH=1,DAY_OF_MONTH=6,DAY_OF_YEAR=250,DAY_OF_WEEK=6,DAY_OF_WEEK_IN_MONTH=1,AM_PM=0,HOUR=0,HOUR_OF_DAY=0,MINUTE=41,SECOND=0,MILLISECOND=913,ZONE_OFFSET=0,DST_OFFSET=0] +// java.util.GregorianCalendar[time=1725639947251,areFieldsSet=true,areAllFieldsSet=true,lenient=true,zone=sun.util.calendar.ZoneInfo[id="Etc/UTC",offset=0,dstSavings=0,useDaylight=false,transitions=0,lastRule=null],firstDayOfWeek=1,minimalDaysInFirstWeek=1,ERA=1,YEAR=2024,MONTH=8,WEEK_OF_YEAR=36,WEEK_OF_MONTH=1,DAY_OF_MONTH=6,DAY_OF_YEAR=250,DAY_OF_WEEK=6,DAY_OF_WEEK_IN_MONTH=1,AM_PM=1,HOUR=4,HOUR_OF_DAY=16,MINUTE=25,SECOND=47,MILLISECOND=251,ZONE_OFFSET=0,DST_OFFSET=0] // ) // )

And an instance of a TypedDataset:

@@ -167,7 +167,7 @@

cal } } -// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@3af49f18 +// calendarToLongInjection: AnyRef with Injection[Calendar, Long] = repl.MdocSession$MdocApp0$$anon$1@1c56d8a2

We can be less verbose using the Injection.apply function:

import frameless._
 
@@ -180,7 +180,7 @@ 

cal.setTime(new java.util.Date(l)) cal }) -// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@4316a918

+// calendarToLongInjection: Injection[Calendar, Long] = frameless.Injection$$anon$1@316f02b8

Now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, birthday: bigint]
@@ -212,7 +212,7 @@

case 2 => Female case 3 => Other }) -// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@715f96a1 +// genderToInt: Injection[Gender, Int] = frameless.Injection$$anon$1@73840342

And now we can create our TypedDataset:

val personDS = TypedDataset.create(people)
 // personDS: TypedDataset[Person] = [age: int, gender: int]
diff --git a/Job.html b/Job.html index 6dff43a6..87b79baa 100644 --- a/Job.html +++ b/Job.html @@ -156,7 +156,7 @@

Job[A]

count <- ds.count() sample <- ds.take((count/5).toInt) } yield sample -// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@2e5aa129 +// countAndTakeJob: frameless.Job[Seq[Int]] = frameless.Job$$anon$3@7974a710 countAndTakeJob.run() // res1: Seq[Int] = WrappedArray(1, 2, 3, 4) @@ -167,7 +167,7 @@

Job[A]

def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min) val finalJob = computeMinOfSample(countAndTakeJob) -// finalJob: Job[Int] = frameless.Job$$anon$2@15065659 +// finalJob: Job[Int] = frameless.Job$$anon$2@3744f6e4

Now we can execute this new job by specifying a group-id and a description. This allows the programmer to see this information on the Spark UI and help track, say, performance issues.

diff --git a/TypedDatasetVsSparkDataset.html b/TypedDatasetVsSparkDataset.html index 8c4ece0b..a47e2253 100644 --- a/TypedDatasetVsSparkDataset.html +++ b/TypedDatasetVsSparkDataset.html @@ -160,8 +160,8 @@

Comparing T // | i| j| // +---+---+ // | 10| W| -// |100| E| // | 1| Q| +// |100| E| // +---+---+ //

The value ds holds the content of the initialDs read from a parquet file. diff --git a/TypedEncoder.html b/TypedEncoder.html index 57dd3ee8..8e7ec99e 100644 --- a/TypedEncoder.html +++ b/TypedEncoder.html @@ -206,7 +206,7 @@

Typed Encoders in Frameless// ds: TypedDataset[Foo] = [i: int, b: struct<d: double, s: string>] ds.collect() -// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@6c3acdd +// res3: frameless.Job[Seq[Foo]] = frameless.Job$$anon$4@1f6aec07

But any non-encodable in the case class hierarchy will be detected at compile time:

case class BarDate(d: Double, s: String, t: java.util.Calendar)
 case class FooDate(i: Int, b: BarDate)
diff --git a/TypedML.html b/TypedML.html index d618667f..8ea3421a 100644 --- a/TypedML.html +++ b/TypedML.html @@ -176,7 +176,7 @@

case class Features(squareFeet: Double, hasGarden: Boolean) val assembler = TypedVectorAssembler[Features] -// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@71bdf5cb +// assembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@6cbe0491 case class HouseDataWithFeatures(squareFeet: Double, hasGarden: Boolean, price: Double, features: Vector) val trainingDataWithFeatures = assembler.transform(trainingData).as[HouseDataWithFeatures] @@ -206,10 +206,10 @@

case class RFInputs(price: Double, features: Vector) val rf = TypedRandomForestRegressor[RFInputs] -// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@25bb6f25 +// rf: TypedRandomForestRegressor[RFInputs] = frameless.ml.regression.TypedRandomForestRegressor@710d68fa val model = rf.fit(trainingDataWithFeatures).run() -// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@5a85c7ef +// model: AppendTransformer[RFInputs, TypedRandomForestRegressor.Outputs, org.apache.spark.ml.regression.RandomForestRegressionModel] = frameless.ml.TypedEstimator$$anon$1@2192a871

TypedRandomForestRegressor[RFInputs] compiles only if RFInputs contains only one field of type Double (the label) and one field of type Vector (the features):

case class WrongRFInputs(labelOfWrongType: String, features: Vector)
@@ -269,7 +269,7 @@

case class Features(price: Double, squareFeet: Double) val vectorAssembler = TypedVectorAssembler[Features] -// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@64815bc9 +// vectorAssembler: TypedVectorAssembler[Features] = frameless.ml.feature.TypedVectorAssembler@161188a2 case class HouseDataWithFeatures(squareFeet: Double, city: String, price: Double, features: Vector) val dataWithFeatures = vectorAssembler.transform(trainingData).as[HouseDataWithFeatures] @@ -277,11 +277,11 @@

case class StringIndexerInput(city: String) val indexer = TypedStringIndexer[StringIndexerInput] -// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@4ad917ee +// indexer: TypedStringIndexer[StringIndexerInput] = frameless.ml.feature.TypedStringIndexer@4f8dc128 indexer.estimator.setHandleInvalid("keep") -// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_4455268e7725 +// res12: org.apache.spark.ml.feature.StringIndexer = strIdx_1712b46c162d val indexerModel = indexer.fit(dataWithFeatures).run() -// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@43058dc4 +// indexerModel: AppendTransformer[StringIndexerInput, TypedStringIndexer.Outputs, org.apache.spark.ml.feature.StringIndexerModel] = frameless.ml.TypedEstimator$$anon$1@75de8e5f case class HouseDataWithFeaturesAndIndex( squareFeet: Double, @@ -295,10 +295,10 @@

case class RFInputs(cityIndexed: Double, features: Vector) val rf = TypedRandomForestClassifier[RFInputs] -// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@238a4f15 +// rf: TypedRandomForestClassifier[RFInputs] = frameless.ml.classification.TypedRandomForestClassifier@155d814 val model = rf.fit(indexedData).run() -// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@6d0774a3 +// model: AppendTransformer[RFInputs, TypedRandomForestClassifier.Outputs, org.apache.spark.ml.classification.RandomForestClassificationModel] = frameless.ml.TypedEstimator$$anon$1@519c13c0

Prediction

We now want to predict city for testData using the previously trained model. Like the Spark ML API, @@ -330,7 +330,7 @@

case class IndexToStringInput(predictedCityIndexed: Double) val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels) -// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@4740ff89 +// indexToString: TypedIndexToString[IndexToStringInput] = frameless.ml.feature.TypedIndexToString@35c731e3 case class HouseCityPrediction( features: Vector,