package com.amazon.deequ.profiles;

import com.amazon.deequ.analyzers.Analyzer;
import com.amazon.deequ.analyzers.KLLParameters;
import com.amazon.deequ.analyzers.KLLSketch;
import com.amazon.deequ.analyzers.Maximum;
import com.amazon.deequ.analyzers.Maximum$;
import com.amazon.deequ.analyzers.Mean;
import com.amazon.deequ.analyzers.Mean$;
import com.amazon.deequ.analyzers.Minimum;
import com.amazon.deequ.analyzers.Minimum$;
import com.amazon.deequ.analyzers.Size;
import com.amazon.deequ.analyzers.Size$;
import com.amazon.deequ.analyzers.StandardDeviation;
import com.amazon.deequ.analyzers.StandardDeviation$;
import com.amazon.deequ.analyzers.StandardScanShareableAnalyzer;
import com.amazon.deequ.analyzers.Sum;
import com.amazon.deequ.analyzers.Sum$;
import com.amazon.deequ.analyzers.runners.AnalysisRunBuilder;
import com.amazon.deequ.analyzers.runners.AnalysisRunner$;
import com.amazon.deequ.analyzers.runners.AnalyzerContext;
import com.amazon.deequ.analyzers.runners.AnalyzerContext$;
import com.amazon.deequ.analyzers.runners.ReusingNotPossibleResultsMissingException;
import com.amazon.deequ.metrics.Distribution;
import com.amazon.deequ.metrics.Metric;
import com.amazon.deequ.repository.MetricsRepository;
import com.amazon.deequ.repository.ResultKey;
import org.apache.spark.rdd.RDD$;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.BooleanType$;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DoubleType$;
import org.apache.spark.sql.types.FloatType$;
import org.apache.spark.sql.types.IntegerType$;
import org.apache.spark.sql.types.LongType$;
import org.apache.spark.sql.types.ShortType$;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.sql.types.StructType;
import scala.Array$;
import scala.Enumeration;
import scala.Function1;
import scala.None$;
import scala.Option;
import scala.Predef$;
import scala.Predef$DummyImplicit$;
import scala.StringContext;
import scala.Tuple2;
import scala.collection.IterableLike;
import scala.collection.Seq;
import scala.collection.Seq$;
import scala.collection.TraversableLike;
import scala.collection.TraversableOnce;
import scala.collection.generic.GenericTraversableTemplate;
import scala.collection.immutable.Iterable$;
import scala.collection.immutable.Map;
import scala.collection.immutable.Map$;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.Set;
import scala.collection.mutable.StringBuilder;
import scala.math.Ordering$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxesRunTime;
import scala.runtime.ObjectRef;

/* compiled from: ColumnProfiler.scala */
/* loaded from: input_file:com/amazon/deequ/profiles/ColumnProfiler$.class */
public final class ColumnProfiler$ {
    /**
     * Slot for the singleton instance; the private constructor stores {@code this} here.
     * NOTE(review): the {@code = null} initializer combined with a later assignment to a
     * {@code final} field looks like a decompiler rendering of a Scala {@code object} --
     * confirm before trying to compile this file as hand-written Java.
     */
    public static final ColumnProfiler$ MODULE$ = null;
    /** Backing field for {@code DEFAULT_CARDINALITY_THRESHOLD()}; assigned once in the constructor. */
    private final int DEFAULT_CARDINALITY_THRESHOLD;

    // Creates the singleton when the class is initialised; the constructor publishes it via MODULE$.
    static {
        new ColumnProfiler$();
    }

    /**
     * Accessor for the default cardinality threshold (the constructor initialises it to 120).
     *
     * @return the value stored in the {@code DEFAULT_CARDINALITY_THRESHOLD} field
     */
    public int DEFAULT_CARDINALITY_THRESHOLD() {
        return DEFAULT_CARDINALITY_THRESHOLD;
    }

    /**
     * Profiles the relevant columns of {@code dataset} in up to three passes and bundles the results.
     *
     * <ol>
     *   <li>Pass 1 runs the generic-statistics analyzers plus a {@code Size} analyzer.</li>
     *   <li>Pass 2 runs the second-pass analyzers on the data returned by
     *       {@code castNumericStringColumns}.</li>
     *   <li>Pass 3 (only when {@code z4} is true) obtains histograms for the target columns; an
     *       analyzer context with reusable histogram results is looked up first and the target
     *       columns are filtered by a predicate that receives that context.</li>
     * </ol>
     *
     * @param dataset data to profile
     * @param option  optional column selection; handed to a per-selection check closure and to
     *                {@code getRelevantColumns}
     * @param z       when true, a progress line is printed before each pass
     * @param i       threshold forwarded to {@code findTargetColumnsForHistograms}
     * @param option2 optional metrics repository used for configuring runs and reusing/saving results
     * @param option3 optional result key forwarded to the repository configuration and to the
     *                histogram reuse lookup
     * @param z2      forwarded to the repository configuration; in pass 3 it turns "histograms still
     *                need computing" into a {@code ReusingNotPossibleResultsMissingException}
     * @param option4 optional result key forwarded to the repository configuration and to the saving
     *                of pass-3 results
     * @param z3      forwarded to the second-pass analyzer selection, where it enables the additional
     *                analyzers derived from the list of second-pass columns
     * @param z4      enables pass 3; when false an empty histogram map is used
     * @param z5      forwarded to the second-pass analyzer selection, where it enables {@code KLLSketch}
     * @param option5 optional parameters passed to {@code KLLSketch}
     * @param map     per-column {@code Enumeration.Value} map forwarded to the generic-statistics
     *                analyzer selection and extraction (presumably predefined column types -- confirm)
     * @return the profiles created from the generic, numeric and categorical statistics
     */
    public ColumnProfiles profile(Dataset<Row> dataset, Option<Seq<String>> option, boolean z, int i, Option<MetricsRepository> option2, Option<ResultKey> option3, boolean z2, Option<ResultKey> option4, boolean z3, boolean z4, boolean z5, Option<KLLParameters> option5, Map<String, Enumeration.Value> map) {
        option.foreach(new ColumnProfiler$$anonfun$profile$1(dataset));
        Seq<String> columns = getRelevantColumns(dataset.schema(), option);

        // ---- pass 1: generic statistics ----
        if (z) {
            Predef$.MODULE$.println("### PROFILING: Computing generic column statistics in pass (1/3)...");
        }
        AnalysisRunBuilder genericRun = AnalysisRunner$.MODULE$.onData(dataset).addAnalyzers(getAnalyzersForGenericStats(dataset.schema(), columns, map)).addAnalyzer(new Size(Size$.MODULE$.apply$default$1()));
        AnalyzerContext genericResults = setMetricsRepositoryConfigurationIfNecessary(genericRun, option2, option3, z2, option4).run();
        GenericColumnStatistics genericStats = extractGenericStatistics(columns, dataset.schema(), genericResults, map);

        // ---- pass 2: numeric statistics ----
        if (z) {
            Predef$.MODULE$.println("### PROFILING: Computing numeric column statistics in pass (2/3)...");
        }
        AnalysisRunBuilder numericRun = AnalysisRunner$.MODULE$.onData(castNumericStringColumns(columns, dataset, genericStats)).addAnalyzers(getAnalyzersForSecondPass(columns, genericStats, z5, option5, z3));
        AnalyzerContext numericResults = setMetricsRepositoryConfigurationIfNecessary(numericRun, option2, option3, z2, option4).run();
        NumericColumnStatistics numericStats = extractNumericStatistics(numericResults);

        // ---- pass 3: histograms (optional) ----
        Map<String, Distribution> histograms;
        if (!z4) {
            histograms = Predef$.MODULE$.Map().empty2();
        } else {
            if (z) {
                Predef$.MODULE$.println("### PROFILING: Computing histograms of low-cardinality columns in pass (3/3)...");
            }
            Seq<String> histogramTargets = findTargetColumnsForHistograms(dataset.schema(), genericStats, i);
            AnalyzerContext reusableResults = getAnalyzerContextWithHistogramResultsForReusingIfNecessary(option2, option3, histogramTargets);
            Seq<String> remainingTargets = (Seq) histogramTargets.filter(new ColumnProfiler$$anonfun$17(reusableResults));
            histograms = getHistogramsForThirdPass(dataset, remainingTargets, reusableResults, z, z2, option2, option4);
        }

        return createProfiles(columns, genericStats, numericStats, new CategoricalColumnStatistics(histograms));
    }

    /**
     * Default for the 2nd parameter of {@code profile} (the optional column selection), following
     * the Scala compiler's {@code name$default$N} naming.
     *
     * @return {@code None}, i.e. no explicit column selection
     */
    public Option<Seq<String>> profile$default$2() {
        return None$.MODULE$;
    }

    /**
     * Default for the 3rd parameter of {@code profile}, the flag that gates the progress
     * {@code println} calls.
     *
     * @return {@code false}, so no progress lines are printed by default
     */
    public boolean profile$default$3() {
        return false;
    }

    /**
     * Default for the 4th parameter of {@code profile}, the threshold that {@code profile} forwards
     * to {@code findTargetColumnsForHistograms}.
     *
     * @return the value of {@code DEFAULT_CARDINALITY_THRESHOLD()}
     */
    public int profile$default$4() {
        return DEFAULT_CARDINALITY_THRESHOLD();
    }

    /**
     * Default for the 5th parameter of {@code profile}, the optional metrics repository.
     *
     * @return {@code None}, i.e. no repository
     */
    public Option<MetricsRepository> profile$default$5() {
        return None$.MODULE$;
    }

    /**
     * Default for the 6th parameter of {@code profile}, the optional result key that is forwarded
     * to the repository configuration and to the histogram reuse lookup.
     *
     * @return {@code None}
     */
    public Option<ResultKey> profile$default$6() {
        return None$.MODULE$;
    }

    /**
     * Default for the 7th parameter of {@code profile}, the flag that (among other uses) makes
     * {@code getHistogramsForThirdPass} throw when histograms would still have to be computed.
     *
     * @return {@code false}
     */
    public boolean profile$default$7() {
        return false;
    }

    /**
     * Default for the 8th parameter of {@code profile}, the optional result key that is forwarded
     * to the repository configuration and to the saving of third-pass results.
     *
     * @return {@code None}
     */
    public Option<ResultKey> profile$default$8() {
        return None$.MODULE$;
    }

    /**
     * Default for the 9th parameter of {@code profile}, the flag forwarded as the last argument of
     * {@code getAnalyzersForSecondPass}.
     *
     * @return {@code true}
     */
    public boolean profile$default$9() {
        return true;
    }

    /**
     * Default for the 10th parameter of {@code profile}, the flag that enables the third
     * (histogram) pass.
     *
     * @return {@code true}, so histograms are computed by default
     */
    public boolean profile$default$10() {
        return true;
    }

    /**
     * Default for the 11th parameter of {@code profile}, the flag that enables the
     * {@code KLLSketch} analyzer in the second pass.
     *
     * @return {@code false}
     */
    public boolean profile$default$11() {
        return false;
    }

    /**
     * Default for the 12th parameter of {@code profile}, the optional parameters handed to
     * {@code KLLSketch}.
     *
     * @return {@code None}
     */
    public Option<KLLParameters> profile$default$12() {
        return None$.MODULE$;
    }

    /**
     * Default for the 13th parameter of {@code profile}, the per-column
     * {@code Enumeration.Value} map.
     *
     * @return an empty map
     */
    public Map<String, Enumeration.Value> profile$default$13() {
        return Predef$.MODULE$.Map().empty2();
    }

    /**
     * Selects schema fields with a predicate built from {@code option} and maps the surviving
     * fields to the returned sequence (presumably their names -- see the mapping closure).
     *
     * @param structType schema whose fields are inspected
     * @param option     optional column selection captured by the filter predicate
     * @return the mapped sequence for the fields that passed the filter, in schema order
     */
    private Seq<String> getRelevantColumns(StructType structType, Option<Seq<String>> option) {
        Object[] keptFields = (Object[]) Predef$.MODULE$.refArrayOps(structType.fields()).filter(new ColumnProfiler$$anonfun$getRelevantColumns$1(option));
        return (Seq) Predef$.MODULE$.refArrayOps(keptFields).map(new ColumnProfiler$$anonfun$getRelevantColumns$2(), Array$.MODULE$.fallbackCanBuildFrom(Predef$DummyImplicit$.MODULE$.dummyImplicit()));
    }

    /**
     * Builds the first-pass analyzers: schema fields are filtered by a predicate that captures
     * {@code seq}, and each remaining field is flat-mapped to analyzers by a closure that
     * captures {@code map}.
     *
     * @param structType schema whose fields are inspected
     * @param seq        column names captured by the field filter
     * @param map        per-column {@code Enumeration.Value} map captured by the analyzer factory
     * @return the concatenated analyzers for all fields that passed the filter
     */
    private Seq<Analyzer<?, Metric<?>>> getAnalyzersForGenericStats(StructType structType, Seq<String> seq, Map<String, Enumeration.Value> map) {
        Object[] keptFields = (Object[]) Predef$.MODULE$.refArrayOps(structType.fields()).filter(new ColumnProfiler$$anonfun$getAnalyzersForGenericStats$1(seq));
        return (Seq) Predef$.MODULE$.refArrayOps(keptFields).flatMap(new ColumnProfiler$$anonfun$getAnalyzersForGenericStats$2(map), Array$.MODULE$.fallbackCanBuildFrom(Predef$DummyImplicit$.MODULE$.dummyImplicit()));
    }

    /**
     * Builds the second-pass analyzers. Columns are first narrowed by a predicate over the
     * generic statistics; every remaining column is then flat-mapped to analyzers by a closure
     * that also receives the full list of remaining columns.
     *
     * @param seq                     candidate column names
     * @param genericColumnStatistics first-pass statistics captured by the column predicate
     * @param z                       forwarded to the per-column analyzer factory
     * @param option                  optional KLL parameters forwarded to the per-column factory
     * @param z2                      forwarded to the per-column analyzer factory
     * @return the concatenated analyzers for all columns that passed the predicate
     */
    private Seq<Analyzer<?, Metric<?>>> getAnalyzersForSecondPass(Seq<String> seq, GenericColumnStatistics genericColumnStatistics, boolean z, Option<KLLParameters> option, boolean z2) {
        Seq eligibleColumns = seq.filter(new ColumnProfiler$$anonfun$18(genericColumnStatistics));
        ColumnProfiler$$anonfun$getAnalyzersForSecondPass$1 perColumnFactory = new ColumnProfiler$$anonfun$getAnalyzersForSecondPass$1(z, option, z2, eligibleColumns);
        return (Seq) eligibleColumns.flatMap(perColumnFactory, Seq$.MODULE$.canBuildFrom());
    }

    /**
     * Default for the 4th parameter of {@code getAnalyzersForSecondPass} (the optional KLL
     * parameters), following the Scala compiler's {@code name$default$N} naming.
     *
     * @return {@code None}
     */
    private Option<KLLParameters> getAnalyzersForSecondPass$default$4() {
        return None$.MODULE$;
    }

    /**
     * Assembles the analyzers for one column: always {@code Minimum}, {@code Maximum},
     * {@code Mean}, {@code StandardDeviation} and {@code Sum}; optionally a {@code KLLSketch};
     * optionally one analyzer per entry of {@code seq} produced by a closure bound to {@code str}.
     *
     * @param str    column name handed to every analyzer
     * @param z      when true, a {@code KLLSketch(str, option)} is appended
     * @param option optional parameters for the {@code KLLSketch}
     * @param z2     when true, the analyzers mapped from {@code seq} are appended
     * @param seq    column names mapped to additional analyzers when {@code z2} is true
     * @return the standard analyzers, followed by the optional sketch, followed by the optional
     *         per-entry analyzers
     */
    public Seq<Analyzer<?, Metric<?>>> com$amazon$deequ$profiles$ColumnProfiler$$getNumericColAnalyzers(String str, boolean z, Option<KLLParameters> option, boolean z2, Seq<String> seq) {
        Seq standardAnalyzers = (Seq) Seq$.MODULE$.apply(Predef$.MODULE$.wrapRefArray(new StandardScanShareableAnalyzer[]{new Minimum(str, Minimum$.MODULE$.apply$default$2()), new Maximum(str, Maximum$.MODULE$.apply$default$2()), new Mean(str, Mean$.MODULE$.apply$default$2()), new StandardDeviation(str, StandardDeviation$.MODULE$.apply$default$2()), new Sum(str, Sum$.MODULE$.apply$default$2())}));

        Seq sketchAnalyzers;
        if (z) {
            sketchAnalyzers = (Seq) Seq$.MODULE$.apply(Predef$.MODULE$.wrapRefArray(new KLLSketch[]{new KLLSketch(str, option)}));
        } else {
            sketchAnalyzers = (Seq) Seq$.MODULE$.empty();
        }
        TraversableLike withSketch = (TraversableLike) standardAnalyzers.$plus$plus(sketchAnalyzers, Seq$.MODULE$.canBuildFrom());

        Seq perEntryAnalyzers;
        if (z2) {
            perEntryAnalyzers = (Seq) seq.map(new ColumnProfiler$$anonfun$19(str), Seq$.MODULE$.canBuildFrom());
        } else {
            perEntryAnalyzers = (Seq) Seq$.MODULE$.empty();
        }
        return (Seq) withSketch.$plus$plus(perEntryAnalyzers, Seq$.MODULE$.canBuildFrom());
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Lets a closure adjust the run builder when a metrics repository is present. The builder is
     * placed in a mutable holder that the closure may overwrite; without a repository the
     * original builder comes back unchanged.
     *
     * @param analysisRunBuilder builder to start from
     * @param option             optional repository; the closure runs only when it is defined
     * @param option2            optional result key captured by the closure
     * @param z                  flag captured by the closure
     * @param option3            optional result key captured by the closure
     * @return whatever builder the holder contains after the optional closure call
     */
    private AnalysisRunBuilder setMetricsRepositoryConfigurationIfNecessary(AnalysisRunBuilder analysisRunBuilder, Option<MetricsRepository> option, Option<ResultKey> option2, boolean z, Option<ResultKey> option3) {
        ObjectRef builderHolder = ObjectRef.create(analysisRunBuilder);
        option.foreach(new ColumnProfiler$$anonfun$setMetricsRepositoryConfigurationIfNecessary$1(option2, z, option3, builderHolder));
        return (AnalysisRunBuilder) builderHolder.elem;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Starts from an empty analyzer context and, when a metrics repository is present, lets a
     * closure replace it (presumably with previously stored histogram results -- see the closure).
     *
     * @param option  optional repository; the closure runs only when it is defined
     * @param option2 optional result key captured by the closure
     * @param seq     column names captured by the closure
     * @return the context held after the optional closure call; the empty context when there is
     *         no repository
     */
    private AnalyzerContext getAnalyzerContextWithHistogramResultsForReusingIfNecessary(Option<MetricsRepository> option, Option<ResultKey> option2, Seq<String> seq) {
        ObjectRef contextHolder = ObjectRef.create(AnalyzerContext$.MODULE$.empty());
        option.foreach(new ColumnProfiler$$anonfun$getAnalyzerContextWithHistogramResultsForReusingIfNecessary$1(option2, seq, contextHolder));
        return (AnalyzerContext) contextHolder.elem;
    }

    /**
     * Re-keys a column-name-to-distribution map as an analyzer-to-metric map by applying a
     * conversion closure to every entry.
     *
     * @param map distributions keyed by column name
     * @return the converted entries as an analyzer-to-metric map
     */
    private Map<Analyzer<?, Metric<?>>, Metric<?>> convertColumnNamesAndDistributionToHistogramWithMetric(Map<String, Distribution> map) {
        ColumnProfiler$$anonfun$convertColumnNamesAndDistributionToHistogramWithMetric$1 toAnalyzerEntry = new ColumnProfiler$$anonfun$convertColumnNamesAndDistributionToHistogramWithMetric$1();
        return (Map) map.map(toAnalyzerEntry, Map$.MODULE$.canBuildFrom());
    }

    /**
     * Hands the analyzer context to a persistence closure when a metrics repository is present;
     * does nothing otherwise.
     *
     * @param analyzerContext results captured by the closure
     * @param option          optional repository; the closure runs only when it is defined
     * @param option2         optional result key captured by the closure
     */
    private void saveOrAppendResultsIfNecessary(AnalyzerContext analyzerContext, Option<MetricsRepository> option, Option<ResultKey> option2) {
        ColumnProfiler$$anonfun$saveOrAppendResultsIfNecessary$1 persistResults = new ColumnProfiler$$anonfun$saveOrAppendResultsIfNecessary$1(analyzerContext, option2);
        option.foreach(persistResults);
    }

    /**
     * Replaces column {@code str} with a copy of itself cast to {@code dataType}. The cast values
     * are first written to a temporary column named {@code str + "___CASTED"}, the original
     * column is dropped, and the temporary column is renamed back to {@code str}.
     *
     * @param dataset  data containing the column
     * @param str      name of the column to cast
     * @param dataType target type of the cast
     * @return the dataset produced by the add / drop / rename sequence
     */
    public Dataset<Row> castColumn(Dataset<Row> dataset, String str, DataType dataType) {
        String temporaryName = str + "___CASTED";
        Dataset<Row> withCastedCopy = dataset.withColumn(temporaryName, dataset.apply(str).cast(dataType));
        return withCastedCopy.drop(str).withColumnRenamed(temporaryName, str);
    }

    /* JADX WARN: Type inference failed for: r0v14, types: [scala.collection.Map] */
    /* JADX WARN: Type inference failed for: r0v9, types: [scala.collection.Map] */
    /**
     * Builds a {@code GenericColumnStatistics} from the metrics of the first pass.
     *
     * <p>Each constructor argument is obtained by running a dedicated partial-function collector
     * over {@code analyzerContext.metricMap()}; what each collector matches is defined in the
     * corresponding {@code ColumnProfiler$$anonfun$N} class and is not visible here.
     *
     * @param seq             column names captured by the schema-field filter
     * @param structType      schema from which one of the constructor arguments is derived
     * @param analyzerContext first-pass results
     * @param map             per-column {@code Enumeration.Value} map; used to exclude entries from
     *                        two of the collected maps and from the schema-derived map, and passed
     *                        through as the last constructor argument
     * @return the assembled statistics
     */
    private GenericColumnStatistics extractGenericStatistics(Seq<String> seq, StructType structType, AnalyzerContext analyzerContext, Map<String, Enumeration.Value> map) {
        // First collected metric value, unboxed as double and truncated to long. Presumably the
        // Size metric added in profile(); head() fails if the collector matched nothing.
        long unboxToDouble = (long) BoxesRunTime.unboxToDouble(((IterableLike) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$1(), Iterable$.MODULE$.canBuildFrom())).mo3181head());
        // The next two maps skip metric entries rejected by a predicate built from `map`.
        Map map2 = (Map) analyzerContext.metricMap().filterNot((Function1<Analyzer<?, Metric<?>>, Object>) new ColumnProfiler$$anonfun$22(map)).collect(new ColumnProfiler$$anonfun$2(), Map$.MODULE$.canBuildFrom());
        Map map3 = (Map) analyzerContext.metricMap().filterNot((Function1<Analyzer<?, Metric<?>>, Object>) new ColumnProfiler$$anonfun$23(map)).collect(new ColumnProfiler$$anonfun$3(), Map$.MODULE$.canBuildFrom());
        // The remaining maps are collected from the unfiltered metric map.
        Map map4 = (Map) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$4(), Map$.MODULE$.canBuildFrom());
        Map map5 = (Map) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$5(), Map$.MODULE$.canBuildFrom());
        Map map6 = (Map) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$6(), Map$.MODULE$.canBuildFrom());
        Map map7 = (Map) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$7(), Map$.MODULE$.canBuildFrom());
        // Third argument: schema fields kept by `seq`, minus those excluded via `map`, narrowed by
        // one more predicate, then turned into key/value pairs and collected into a map.
        return new GenericColumnStatistics(unboxToDouble, map2, Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(structType.fields()).filter(new ColumnProfiler$$anonfun$25(seq))).filterNot(new ColumnProfiler$$anonfun$26(map))).filter(new ColumnProfiler$$anonfun$27())).map(new ColumnProfiler$$anonfun$28(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).toMap(Predef$.MODULE$.$conforms()), map3, map4, map5, (Map) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$8(), Map$.MODULE$.canBuildFrom()), map6, map7, map);
    }

    /**
     * Default for the 4th parameter of {@code extractGenericStatistics} (the per-column
     * {@code Enumeration.Value} map), following the Scala compiler's {@code name$default$N} naming.
     *
     * @return an empty map
     */
    private Map<String, Enumeration.Value> extractGenericStatistics$default$4() {
        return Predef$.MODULE$.Map().empty2();
    }

    /**
     * Walks over the column names and lets a closure replace the working dataset as it goes;
     * the closure sees the generic statistics and a mutable holder that initially contains
     * {@code dataset}.
     *
     * @param seq                     column names visited in order
     * @param dataset                 starting dataset
     * @param genericColumnStatistics first-pass statistics captured by the closure
     * @return the dataset left in the holder after all columns were visited
     */
    private Dataset<Row> castNumericStringColumns(Seq<String> seq, Dataset<Row> dataset, GenericColumnStatistics genericColumnStatistics) {
        ObjectRef datasetHolder = ObjectRef.create(dataset);
        seq.foreach(new ColumnProfiler$$anonfun$castNumericStringColumns$1(genericColumnStatistics, datasetHolder));
        return (Dataset) datasetHolder.elem;
    }

    /**
     * Builds a {@code NumericColumnStatistics} from the metrics of the second pass.
     *
     * <p>The first seven constructor arguments all follow one pattern: collect matching entries
     * from {@code analyzerContext.metricMap()}, flatten the collected values, and turn the
     * resulting pairs into a map. The eighth argument is collected and flattened the same way but
     * is then grouped by a key function and each group is mapped to a map entry.
     *
     * <p>Note the collector order in the argument list: 9, 10, 12, 11, 13, 14, 15, 16 -- the third
     * and fourth arguments use collectors 12 and 11 respectively.
     *
     * @param analyzerContext second-pass results
     * @return the assembled statistics
     */
    private NumericColumnStatistics extractNumericStatistics(AnalyzerContext analyzerContext) {
        return new NumericColumnStatistics(((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$9(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$29())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$10(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$30())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$12(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$32())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$11(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$31())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$13(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$33())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$14(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$34())).toMap(Predef$.MODULE$.$conforms()), ((TraversableOnce) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$15(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$35())).toMap(Predef$.MODULE$.$conforms()), (Map) ((TraversableLike) ((GenericTraversableTemplate) analyzerContext.metricMap().collect(new ColumnProfiler$$anonfun$16(), Iterable$.MODULE$.canBuildFrom())).flatten2(new ColumnProfiler$$anonfun$36())).groupBy((Function1) new ColumnProfiler$$anonfun$37()).map(new ColumnProfiler$$anonfun$38(), 
Map$.MODULE$.canBuildFrom()));
    }

    /**
     * Chooses the columns for which histograms should be produced.
     *
     * <p>Steps, innermost first:
     * <ol>
     *   <li>Schema fields are filtered by a predicate that receives the set of data types
     *       String, Boolean, Double, Float, Integer, Long and Short; the surviving fields are
     *       mapped (presumably to their names) and collected into a set.</li>
     *   <li>The entries of {@code genericColumnStatistics.approximateNumDistincts()} are filtered
     *       by a predicate that receives the statistics and that set.</li>
     *   <li>The remaining entries are filtered by a predicate that receives {@code j}.</li>
     *   <li>Each remaining entry is mapped to the returned element and the result is converted
     *       to a sequence.</li>
     * </ol>
     *
     * @param structType              schema of the profiled data
     * @param genericColumnStatistics first-pass statistics supplying the approximate distinct counts
     * @param j                       threshold captured by the second entry filter
     * @return the selected columns
     */
    private Seq<String> findTargetColumnsForHistograms(StructType structType, GenericColumnStatistics genericColumnStatistics, long j) {
        return ((TraversableOnce) ((TraversableLike) ((TraversableLike) genericColumnStatistics.approximateNumDistincts().filter(new ColumnProfiler$$anonfun$findTargetColumnsForHistograms$1(genericColumnStatistics, ((TraversableOnce) ((TraversableLike) structType.filter(new ColumnProfiler$$anonfun$39((Set) Predef$.MODULE$.Set().apply(Predef$.MODULE$.wrapRefArray(new DataType[]{StringType$.MODULE$, BooleanType$.MODULE$, DoubleType$.MODULE$, FloatType$.MODULE$, IntegerType$.MODULE$, LongType$.MODULE$, ShortType$.MODULE$}))))).map(new ColumnProfiler$$anonfun$40(), Seq$.MODULE$.canBuildFrom())).toSet()))).filter(new ColumnProfiler$$anonfun$findTargetColumnsForHistograms$2(j))).map(new ColumnProfiler$$anonfun$findTargetColumnsForHistograms$3(), Iterable$.MODULE$.canBuildFrom())).toSeq();
    }

    /**
     * Computes one distribution per requested column from the dataset's RDD.
     *
     * <p>Steps, innermost first:
     * <ol>
     *   <li>The schema fields are mapped to strings, zipped with their index and collected into
     *       a map (string to field position).</li>
     *   <li>{@code dataset.rdd()} is flat-mapped by a closure that receives {@code seq} and that
     *       map, producing {@code Tuple2} records.</li>
     *   <li>The records are treated as a pair RDD with {@code Tuple2} keys (ordered as a pair of
     *       strings) and {@code Int} values, and {@code countByKey()} is called on it.</li>
     *   <li>Each column in {@code seq} is mapped to a pair by a closure that receives the counts,
     *       and the pairs are collected into the returned map.</li>
     * </ol>
     *
     * @param dataset data to scan
     * @param seq     columns for which distributions are built
     * @return distributions keyed by column name
     */
    private Map<String, Distribution> computeHistograms(Dataset<Row> dataset, Seq<String> seq) {
        return ((TraversableOnce) seq.map(new ColumnProfiler$$anonfun$computeHistograms$1(RDD$.MODULE$.rddToPairRDDFunctions(dataset.rdd().flatMap(new ColumnProfiler$$anonfun$42(seq, Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(dataset.schema().fields()).map(new ColumnProfiler$$anonfun$41(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)))).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).toMap(Predef$.MODULE$.$conforms())), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.apply(Tuple2.class), ClassTag$.MODULE$.Int(), Ordering$.MODULE$.Tuple2(Ordering$String$.MODULE$, Ordering$String$.MODULE$)).countByKey()), Seq$.MODULE$.canBuildFrom())).toMap(Predef$.MODULE$.$conforms());
    }

    /**
     * Produces the histogram map for the third profiling pass.
     *
     * <p>When {@code seq} is empty nothing is computed and the result is derived solely from
     * {@code analyzerContext}. Otherwise the histograms for {@code seq} are computed, merged with
     * {@code analyzerContext}, offered to the repository for saving, and the merged context is
     * converted to the result.
     *
     * @param dataset         data to scan when histograms must be computed
     * @param seq             columns whose histograms still have to be computed
     * @param analyzerContext already available results
     * @param z               when true, a line is printed if the pass is skipped
     * @param z2              when true, having to compute any histogram is treated as an error
     * @param option          optional repository used for saving the merged results
     * @param option2         optional result key used for saving the merged results
     * @return distributions keyed by column name
     * @throws ReusingNotPossibleResultsMissingException if {@code seq} is non-empty and {@code z2}
     *         is true
     */
    public Map<String, Distribution> getHistogramsForThirdPass(Dataset<Row> dataset, Seq<String> seq, AnalyzerContext analyzerContext, boolean z, boolean z2, Option<MetricsRepository> option, Option<ResultKey> option2) {
        if (seq.nonEmpty()) {
            if (z2) {
                String missingColumns = seq.mkString(", ");
                throw new ReusingNotPossibleResultsMissingException("Could not find all necessary results in the MetricsRepository, the calculation of " + "the histograms for these columns would be required: " + missingColumns);
            }
            Map<String, Distribution> computed = computeHistograms(dataset, seq);
            AnalyzerContext merged = new AnalyzerContext(convertColumnNamesAndDistributionToHistogramWithMetric(computed)).$plus$plus(analyzerContext);
            saveOrAppendResultsIfNecessary(merged, option, option2);
            return (Map) merged.metricMap().map(new ColumnProfiler$$anonfun$getHistogramsForThirdPass$1(), Map$.MODULE$.canBuildFrom());
        }

        // Nothing left to compute: only the supplied context contributes.
        if (z) {
            Predef$.MODULE$.println("### PROFILING: Skipping pass (3/3), no new histograms need to be calculated.");
        }
        return (Map) analyzerContext.metricMap().map(new ColumnProfiler$$anonfun$getHistogramsForThirdPass$2(), Map$.MODULE$.canBuildFrom());
    }

    /**
     * Combines the three statistics bundles into the final {@code ColumnProfiles}: every column
     * is mapped to a pair by a closure that sees all three bundles, the pairs are collected into
     * a map, and the record count is taken from the generic statistics.
     *
     * @param seq                         columns to create profiles for
     * @param genericColumnStatistics     first-pass statistics; also supplies {@code numRecords()}
     * @param numericColumnStatistics     second-pass statistics
     * @param categoricalColumnStatistics third-pass statistics
     * @return the profiles together with the record count
     */
    private ColumnProfiles createProfiles(Seq<String> seq, GenericColumnStatistics genericColumnStatistics, NumericColumnStatistics numericColumnStatistics, CategoricalColumnStatistics categoricalColumnStatistics) {
        ColumnProfiler$$anonfun$47 toProfileEntry = new ColumnProfiler$$anonfun$47(genericColumnStatistics, numericColumnStatistics, categoricalColumnStatistics);
        Map profilesByColumn = ((TraversableOnce) seq.map(toProfileEntry, Seq$.MODULE$.canBuildFrom())).toMap(Predef$.MODULE$.$conforms());
        return new ColumnProfiles(profilesByColumn, genericColumnStatistics.numRecords());
    }

    /**
     * Private constructor, invoked once from the static initializer. Publishes the instance
     * through {@code MODULE$} and sets the default cardinality threshold to 120.
     * NOTE(review): assigning the static final {@code MODULE$} here looks like a decompiler
     * rendering of Scala's singleton initialisation -- confirm before compiling as plain Java.
     */
    private ColumnProfiler$() {
        MODULE$ = this;
        this.DEFAULT_CARDINALITY_THRESHOLD = 120;
    }
}
