def train(
    train_df: Union[str, Path, pd.DataFrame],
    test_df: Union[str, Path, pd.DataFrame],
    text_col: str = SoftSearch2022DatasetFields.abstract_text,
    label_col: str = SoftSearch2022DatasetFields.label,
    model_storage_path: Union[str, Path] = DEFAULT_SOFT_SEARCH_SEMANTIC_LOGIT_PATH,
) -> Tuple[Path, Pipeline, EvaluationMetrics]:
    """
    Fit, persist, and evaluate the semantic-embedding logistic regression model.

    The pipeline is a ``SentenceEncoder`` (constructed with
    ``DEFAULT_SEMANTIC_EMBEDDING_MODEL``) followed by a
    ``LogisticRegressionCV(max_iter=10000)``. It is fit on ``train_df``,
    pickled to ``model_storage_path``, and then scored on ``test_df``.

    Parameters
    ----------
    train_df: Union[str, Path, pd.DataFrame]
        The training data, or a path to a CSV file to read it from.
    test_df: Union[str, Path, pd.DataFrame]
        The evaluation data, or a path to a CSV file to read it from.
    text_col: str
        Name of the column passed to the pipeline as input.
        Default: SoftSearch2022DatasetFields.abstract_text
    label_col: str
        Name of the column used as the target.
        Default: SoftSearch2022DatasetFields.label
    model_storage_path: Union[str, Path]
        File path the fitted pipeline is pickled to. Missing parent
        directories are created.
        Default: DEFAULT_SOFT_SEARCH_SEMANTIC_LOGIT_PATH

    Returns
    -------
    Path
        The resolved path the pipeline was pickled to.
    Pipeline
        The fitted pipeline.
    EvaluationMetrics
        Weighted-average precision, recall, and F1, plus accuracy, computed
        on ``test_df`` and tagged with model name "semantic-logit".
    """
    # Handle storage dir: resolve the target and make sure its parent exists
    # *before* training, so a bad location fails fast instead of after the fit.
    model_storage_path = Path(model_storage_path).resolve()
    model_storage_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the training DataFrame if a path was given
    if isinstance(train_df, (str, Path)):
        train_df = pd.read_csv(train_df)

    # Read the test DataFrame if a path was given
    if isinstance(test_df, (str, Path)):
        test_df = pd.read_csv(test_df)

    # Build the pipeline
    pipeline = make_pipeline(
        SentenceEncoder(DEFAULT_SEMANTIC_EMBEDDING_MODEL),
        LogisticRegressionCV(max_iter=10000),
    )

    # Fit the pipeline
    pipeline.fit(train_df[text_col], train_df[label_col])

    # Save the pipeline
    with open(model_storage_path, "wb") as open_f:
        pickle.dump(pipeline, open_f)

    # Eval
    preds = pipeline.predict(test_df[text_col])
    pre, rec, f1, _ = precision_recall_fscore_support(
        test_df[label_col],
        preds,
        average="weighted",
    )
    acc = accuracy_score(test_df[label_col], preds)

    return (
        model_storage_path,
        pipeline,
        EvaluationMetrics(
            model="semantic-logit",
            precision=pre,
            recall=rec,
            f1=f1,
            accuracy=acc,
        ),
    )