package org.datacleaner.beans.transform;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Close;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Transformer;
import org.datacleaner.components.categories.ImproveSuperCategory;
import org.datacleaner.components.categories.ReferenceDataCategory;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.reference.Dictionary;
import org.datacleaner.reference.DictionaryConnection;
import org.datacleaner.util.StringUtils;

/**
 * Transformer that strips dictionary matches out of a string column.
 *
 * <p>It produces two output values per row: the input value with all matched parts removed, and
 * the removed parts themselves (either joined into a single space-separated string or as a list,
 * depending on the configured {@link RemovedMatchesType}).
 *
 * <p>Multi-word dictionary entries are matched first using regular expressions that are
 * pre-compiled in {@link #init()}; after that the remaining text is split into tokens and every
 * single-word token is looked up in the dictionary connection.
 */
@ExternalDocumentation({@ExternalDocumentation.DocumentationLink(title = "Segmenting customers on messy data", url = "https://www.youtube.com/watch?v=iy-j5s-uHz4", type = ExternalDocumentation.DocumentationType.VIDEO, version = "4.0")})
@Categorized(superCategory = ImproveSuperCategory.class, value = {ReferenceDataCategory.class})
@Named("Remove dictionary matches")
@Description("Removes any part of a string that is matched against a dictionary. Use it to standardize or prepare values, for instance by removing adjectives that make comparison of similar terms difficult.")
public class RemoveDictionaryMatchesTransformer implements Transformer {
    public static final String PROPERTY_DICTIONARY = "Dictionary";
    public static final String PROPERTY_COLUMN = "Column";
    public static final String OUTPUT_COLUMN_REMOVED_MATCHES = "Removed matches";

    /** Dictionary whose entries are removed from the input values. */
    @Inject
    @Configured(PROPERTY_DICTIONARY)
    Dictionary _dictionary;

    /** Column holding the string values to clean. */
    @Inject
    @Configured(PROPERTY_COLUMN)
    InputColumn<String> _column;

    /** Shape of the second output column; defaults to {@link RemovedMatchesType#STRING}. */
    @Inject
    @Configured
    @Description("How should the 'Removed matches' be returned? Get the removed matches as a concatenated String or as a List.")
    RemovedMatchesType _removedMatchesType;

    /** Configuration handed to the dictionary when its connection is opened. */
    @Inject
    @Provided
    DataCleanerConfiguration _configuration;

    /** Connection opened in {@link #init()} and released in {@link #close()}. */
    private DictionaryConnection _dictionaryConnection;

    /**
     * Multi-word dictionary entries mapped to their compiled patterns, kept in the iteration order
     * of {@code DictionaryConnection.getLengthSortedValues()}.
     */
    private Map<String, Pattern> multiWordDictionaryPatterns;

    /** How the removed matches are exposed in the second output column. */
    public enum RemovedMatchesType implements HasName {
        STRING("String"),
        LIST("List");

        private final String _name;

        RemovedMatchesType(String str) {
            this._name = str;
        }

        public String getName() {
            return this._name;
        }
    }

    /** Creates an unconfigured transformer that reports removed matches as a string. */
    public RemoveDictionaryMatchesTransformer() {
        this._removedMatchesType = RemovedMatchesType.STRING;
    }

    /**
     * Creates a transformer for programmatic use.
     *
     * @param inputColumn the column to read values from
     * @param dictionary the dictionary whose entries are removed
     * @param dataCleanerConfiguration the configuration used to open the dictionary connection
     */
    public RemoveDictionaryMatchesTransformer(InputColumn<String> inputColumn, Dictionary dictionary, DataCleanerConfiguration dataCleanerConfiguration) {
        this();
        this._column = inputColumn;
        this._dictionary = dictionary;
        this._configuration = dataCleanerConfiguration;
    }

    /**
     * Describes the two output columns: the cleaned value (always a string) and the removed
     * matches (a string or a list, depending on the configured type).
     *
     * @return the output column names and types
     * @throws UnsupportedOperationException if the configured removed-matches type is unknown
     */
    public OutputColumns getOutputColumns() {
        final String[] columnNames = {
            this._column.getName() + " (" + this._dictionary.getName() + " removed)",
            OUTPUT_COLUMN_REMOVED_MATCHES
        };
        final Class<?> removedMatchesClass;
        switch (this._removedMatchesType) {
            case STRING:
                removedMatchesClass = String.class;
                break;
            case LIST:
                removedMatchesClass = List.class;
                break;
            default:
                throw new UnsupportedOperationException("Unsupported output type: " + this._removedMatchesType);
        }
        final Class<?>[] columnTypes = {String.class, removedMatchesClass};
        return new OutputColumns(columnNames, columnTypes);
    }

    /**
     * Opens the dictionary connection and pre-compiles one word-boundary-anchored pattern for
     * every dictionary value that is not a single word. For case-insensitive dictionaries the
     * pattern is built from the lower-cased value, because the input is lower-cased before
     * matching as well.
     */
    @Initialize
    public void init() {
        this._dictionaryConnection = this._dictionary.openConnection(this._configuration);
        this.multiWordDictionaryPatterns = new LinkedHashMap<>();
        final boolean caseSensitive = this._dictionary.isCaseSensitive();
        final Iterator<String> values = this._dictionaryConnection.getLengthSortedValues();
        while (values.hasNext()) {
            final String value = values.next();
            if (!StringUtils.isSingleWord(value)) {
                final String literal = caseSensitive ? value : value.toLowerCase();
                this.multiWordDictionaryPatterns.put(value, Pattern.compile("\\b" + Pattern.quote(literal) + "\\b"));
            }
        }
    }

    /** Closes the dictionary connection if one is open; safe to call more than once. */
    @Close
    public void close() {
        if (this._dictionaryConnection != null) {
            this._dictionaryConnection.close();
            this._dictionaryConnection = null;
        }
    }

    /**
     * Transforms the value of the configured column in the given row.
     *
     * @param inputRow the row to read the value from
     * @return see {@link #transform(String)}
     */
    public Object[] transform(InputRow inputRow) {
        return transform((String) inputRow.getValue(this._column));
    }

    /**
     * Removes all dictionary matches from the given value.
     *
     * @param str the value to clean; {@code null} and empty values are passed through untouched
     * @return a two-element array: the cleaned value, followed by the removed matches as either a
     *         space-joined string or a list, depending on the configured type
     * @throws UnsupportedOperationException if the configured removed-matches type is unknown
     */
    public Object[] transform(String str) {
        final List<String> removedMatches = new ArrayList<>(2);
        String remaining = str;
        if (!Strings.isNullOrEmpty(remaining)) {
            // Multi-word entries must go first: once the text is tokenized below they can no
            // longer be recognized as a unit.
            remaining = removeMultiWordMatches(remaining, removedMatches);
            remaining = removeSingleWordMatches(remaining, removedMatches);
        }
        switch (this._removedMatchesType) {
            case STRING:
                return new String[]{remaining, Joiner.on(' ').join(removedMatches)};
            case LIST:
                return new Object[]{remaining, removedMatches};
            default:
                throw new UnsupportedOperationException("Unsupported output type: " + this._removedMatchesType);
        }
    }

    /**
     * Cuts every occurrence of every multi-word dictionary entry out of {@code value}, together
     * with one adjacent space (the preceding one if present, otherwise the following one), and
     * records the dictionary entry once per removed occurrence.
     */
    private String removeMultiWordMatches(String value, List<String> removedMatches) {
        final boolean caseSensitive = this._dictionary.isCaseSensitive();
        String result = value;
        for (Map.Entry<String, Pattern> entry : this.multiWordDictionaryPatterns.entrySet()) {
            // NOTE(review): in the case-insensitive path offsets are read from a lower-cased copy
            // and applied to the original text; this assumes toLowerCase() preserves the string
            // length - confirm for the expected data.
            final Matcher matcher = entry.getValue().matcher(caseSensitive ? result : result.toLowerCase());
            // The matcher keeps reporting offsets into the text it was created with, while
            // 'result' shrinks with every cut. 'removed' translates matcher offsets into the
            // current text; 'cutIndex' is where the latest cut happened, so a later match can
            // never reach back into text before it.
            int removed = 0;
            int cutIndex = 0;
            while (matcher.find()) {
                int start = Math.max(matcher.start() - removed, cutIndex);
                int end = Math.max(matcher.end() - removed, start);
                if (start > cutIndex && result.charAt(start - 1) == ' ') {
                    start--;
                } else if (end < result.length() && result.charAt(end) == ' ') {
                    end++;
                }
                result = result.substring(0, start) + result.substring(end);
                removed += end - start;
                cutIndex = start;
                removedMatches.add(entry.getKey());
            }
        }
        return result;
    }

    /**
     * Splits {@code value} into tokens and drops every single-word token that the dictionary
     * connection contains, recording each dropped token. All other tokens are kept verbatim.
     */
    private String removeSingleWordMatches(String value, List<String> removedMatches) {
        final StringBuilder kept = new StringBuilder();
        // NOTE(review): the 'true' flag presumably makes the splitter keep the delimiters as
        // tokens, which is why non-word tokens are appended back unchanged - confirm in StringUtils.
        for (String token : StringUtils.splitOnWordBoundaries(value, true)) {
            if (StringUtils.isSingleWord(token) && this._dictionaryConnection.containsValue(token)) {
                removedMatches.add(token);
            } else {
                kept.append(token);
            }
        }
        return kept.toString();
    }
}
