graphistry · dcolinmorgan · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/graphistry/feature_utils.py b/graphistry/feature_utils.py
@@ -393,6 +393,28 @@ def convert_money_string_to_float(money: str):
     mask = where_is_currency_column(df, col)
     df[col, mask] = df[col, mask].apply(convert_money_string_to_float)
 
+def try_coerce_to_numeric(ndf: pd.DataFrame):
+    """Return a copy of `ndf` whose mostly-float object columns are cast to float.
+
+    String cells are parsed as floats; a whitespace-separated multi-value cell keeps
+    only its first token. A column that still fails to parse is left unchanged.
+    """
+    def first_float(value):  # '1.5 2.5' -> 1.5; non-strings (floats, NaN) pass through
+        return float(value.split()[0]) if isinstance(value, str) and value.strip() else value
+    try:
+        nndf = ndf.copy()
+        object_columns = nndf.select_dtypes(include=['object']).columns
+    except (AttributeError, TypeError):  # not a DataFrame: nothing to coerce
+        return ndf
+    for j in object_columns:
+        num_floats = sum(isinstance(x, float) for x in nndf[j].dropna())
+        if num_floats > len(nndf[j]) / 2:  # most of column is float
+            try:  # per column, so one unparseable column cannot block the others
+                nndf[j] = nndf[j].map(first_float).astype(float)
+                logger.info("Coerced strings to floats")
+            except (TypeError, ValueError, OverflowError):
+                logger.debug("Could not coerce column %s to float; left as-is", j)
+    return nndf
 
 def is_dataframe_all_numeric(df: pd.DataFrame) -> bool:
     is_all_numeric = True
@@ -890,6 +912,7 @@ def process_dirty_dataframes(
     from sklearn.preprocessing import FunctionTransformer
     t = time()
 
+    ndf = try_coerce_to_numeric(ndf)
     all_numeric = is_dataframe_all_numeric(ndf)
     if not all_numeric and has_dirty_cat:
         data_encoder = SuperVectorizer(

diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py
@@ -438,6 +438,24 @@ def test_edge_scaling(self):
                                   return_scalers=True)
 
 
+    @pytest.mark.skipif(not has_min_dependancy or not has_min_dependancy_text, reason="requires ai feature dependencies")
+    def test_type_edgecase(self):
+        """Smoke test: featurize() accepts float columns polluted with numeric strings."""
+        rng = np.random.RandomState(0)  # fixed seed keeps the fixture reproducible
+        # object dtype up front so string cells can be stored without an implicit upcast
+        df = pd.DataFrame({'A': rng.rand(50), 'B': rng.rand(50)}).astype(object)
+        num_to_convert = int(len(df) * 0.1)
+        indices_to_convert = rng.choice(len(df), num_to_convert, replace=False)
+        indices_to_convertB = rng.choice(len(df), num_to_convert, replace=False)
+        # .loc[row, col] writes into df itself; chained df.A[i] = ... may hit a temporary
+        for i, j in zip(indices_to_convert, indices_to_convertB):
+            df.loc[i, 'A'] = str(df.loc[i, 'A'])
+            df.loc[j, 'B'] = str(df.loc[j, 'B'])
+        df.loc[13, 'A'] = '92.026 123.903 702.124'  # several values in one cell
+        df.loc[33, 'B'] = '26.092 903.123'
+
+        graphistry.nodes(df).featurize()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
@@ -400,6 +400,19 @@ def test_umap_edgecase(self):
 
         graphistry.nodes(df).umap()
         assert True
+
+    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    def test_type_edgecase(self):
+        """Smoke test: umap() accepts float values polluted with numeric strings."""
+        rng = np.random.RandomState(0)  # fixed seed keeps the fixture reproducible
+        # object dtype up front so string cells can be stored without an implicit upcast
+        values = pd.Series(rng.rand(50), dtype=object)
+        for i in rng.choice(len(values), int(len(values) * 0.05), replace=False):
+            values.loc[i] = str(values.loc[i])
+        values.loc[13] = '92.026 123.903 702.124'  # several values in one cell
+        values.loc[33] = '26.092 903.123'
+        # NOTE(review): a Series, not a DataFrame, is handed to nodes() -- confirm intended
+        graphistry.nodes(values).umap()
 
     @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
     def test_node_umap(self):