dbscan clusters on story features
dcolinmorgan committed Mar 29, 2024
1 parent 88a59c9 commit 59edb63
Showing 8 changed files with 377 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/daily.yml
@@ -47,7 +47,7 @@ jobs:
       run: |
         source dots/bin/activate
         python -m spacy download en_core_web_sm
-        python -m main -d 3  # python DOTS/feat.py -d 3
+        python -m main -d 0 -n 100 -f 10
       env:
         OS_TOKEN: ${{ secrets.OS_TOKEN }}
         LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/on_push.yml
@@ -34,7 +34,7 @@ jobs:
       run: |
         source dots/bin/activate
         python -m spacy download en_core_web_sm
-        python DOTS/test/test_dots_feat.py
+        python DOTS/test/test_dots_feat.py -n 5 -f 3 -s 1 -o dots_feats.csv
      env:
        OS_TOKEN: ${{ secrets.OS_TOKEN }}
        LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }}
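Note: the flags passed in these two workflows are not documented in this commit. A hypothetical argparse sketch of how main might consume them, with every flag meaning inferred only from the invocations above (all names and semantics are assumptions):

    import argparse

    parser = argparse.ArgumentParser(description='DOTS story featurization')
    parser.add_argument('-d', type=int, default=0, help='data-source selector (assumed)')
    parser.add_argument('-n', type=int, default=100, help='number of stories to pull (assumed)')
    parser.add_argument('-f', type=int, default=10, help='features to rank per story (assumed)')
    parser.add_argument('-s', type=int, default=1, help='meaning unknown; only the test run passes it (assumed)')
    parser.add_argument('-o', type=str, default='dots_feats.csv', help='output CSV path (assumed)')
    args = parser.parse_args()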
11 changes: 7 additions & 4 deletions DOTS/pull.py
@@ -152,12 +152,15 @@ def process_url(url):
 def pull_data(articles):
     # aa = [item for sublist in articles for item in sublist]
     data = [list(d['_source']['metadata'].values()) for d in articles]
-    df = pd.DataFrame(data, columns=['date', 'title', 'person', 'org', 'location', 'theme', 'text', 'url'])
-    df.date = pd.to_datetime(df.date).dt.strftime('%d-%m-%Y')
-    df['locc'] = df['location'].apply(extract_location)
+    try:  # full OpenSearch article schema
+        df = pd.DataFrame(data, columns=['date', 'title', 'person', 'org', 'location', 'theme', 'text', 'url'])
+        df.date = pd.to_datetime(df.date).dt.strftime('%d-%m-%Y')
+        df['locc'] = df['location'].apply(extract_location)
+    except Exception:  # fall back to the slimmer Google News schema
+        df = pd.DataFrame(data, columns=['title', 'id', 'url', 'title2'])
     with concurrent.futures.ThreadPoolExecutor() as executor:
         df['text'] = list(tqdm(executor.map(process_url, df['url']), total=len(df['url'])))
-    return df.values.tolist()
+    return df['text'].values.tolist()
 
 
 def pull_lobstr_gdoc(pull=1):
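Note: with the new fallback, pull_data returns only the scraped text column. A minimal smoke test for the Google News path (a sketch; the metadata keys are assumptions matching the fallback columns above):

    from DOTS.pull import pull_data

    fake_hit = {'_source': {'metadata': {
        'title': 'headline',
        'id': 'abc123',
        'url': 'https://example.com/story',
        'title2': 'headline (alt)',
    }}}
    # four metadata values cannot fill the eight-column schema, so the
    # except branch builds the ['title', 'id', 'url', 'title2'] frame
    texts = pull_data([fake_hit])
    assert isinstance(texts, list) and len(texts) == 1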
38 changes: 38 additions & 0 deletions DOTS/scrape.py
@@ -29,6 +29,44 @@ def get_OS_data(n):
     data = json.loads(output)
     return data
 
+def get_gnews_data(n):
+    # -d added so curl actually sends the query body; without it the JSON
+    # string is parsed as a second URL and the body is never transmitted
+    bash_command = f"""
+    curl -X GET "{os_url}/test-google-news-index/_search" -H 'Content-Type: application/json' -d '{{
+    "_source": ["metadata.link", "metadata.title"],
+    "size": {n},
+    "query": {{
+        "bool": {{
+            "must": [
+                {{"match_all": {{}}}}
+            ]
+        }}
+    }}
+    }}'
+    """
+    process = subprocess.run(bash_command, shell=True, capture_output=True, text=True)
+    output = process.stdout
+    data = json.loads(output)
+    return data
+
+def get_test_gnews(n):
+    # same query as get_gnews_data but without the Content-Type header
+    bash_command = f"""
+    curl -X GET "{os_url}/test-google-news-index/_search" -d '{{
+    "_source": ["metadata.link", "metadata.title"],
+    "size": {n},
+    "query": {{
+        "bool": {{
+            "must": [
+                {{"match_all": {{}}}}
+            ]
+        }}
+    }}
+    }}'
+    """
+    process = subprocess.run(bash_command, shell=True, capture_output=True, text=True)
+    output = process.stdout
+    data = json.loads(output)
+    return data
+
+
 def get_massive_OS_data(t=1):
     client = OpenSearch(os_url)
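Note: downstream code consumes these responses as OpenSearch hit lists; a sketch pieced together from pull.py and the tests (metadata field names beyond 'link' and 'title' are assumptions):

    from DOTS.scrape import get_gnews_data

    data = get_gnews_data(10)
    for hit in data['hits']['hits']:
        meta = hit['_source']['metadata']
        print(meta.get('title'), meta.get('link'))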
34 changes: 33 additions & 1 deletion DOTS/test/test_dots_feat.py
@@ -3,7 +3,7 @@
 from DOTS.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news
 from DOTS.pull import process_hit, process_data, pull_data
 from datetime import datetime
-
+import pandas as pd
 
 @pytest.fixture
 def get_data():
@@ -90,3 +90,35 @@ def test_lobstr_featurize(get_lobstr_data):
         assert len(features) == 4
     except:
         pass
+
+# needed by the new tests (module paths assumed; these would normally sit at the top of the file)
+from DOTS.scrape import get_gnews_data
+from DOTS.feat import featurize_stories
+import graphistry
+
+
+@pytest.fixture
+def get_gdata():
+    return get_gnews_data(10)
+
+
+def test_get_test_gnews_data(get_gdata):
+    data = get_gdata
+    assert len(data['hits']['hits']) == 10
+
+
+def test_gnews_test(get_gdata):
+    rank_articles = []
+    data = get_gdata
+    hits = data["hits"]["hits"]  # was: response["hits"]["hits"], a NameError
+    articles = pull_data(hits)
+    try:  # some stories will be unretrievable
+        cc = featurize_stories(str(articles), 4, 512)
+        assert len(cc) == 4
+        rank_articles.append(cc)
+    except Exception:
+        pass
+    flattened_list = [item for sublist in rank_articles for item in sublist]
+    data = pd.DataFrame(flattened_list)  # each ranked feature is a row
+    data.drop_duplicates(inplace=True)
+
+    object_columns = data.select_dtypes(include=['object']).columns
+    data[object_columns] = data[object_columns].astype(str)
+
+    g = graphistry.nodes(data)
+    g2 = g.umap()
+    g3 = g2.dbscan()
+    g3.encode_point_color('_dbscan', palette=["hotpink", "dodgerblue"], as_continuous=True).plot()
+
+    assert len(g3._nodes) > max(g3._nodes._dbscan)
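Note: the final assertion is a sanity check that clustering produced fewer cluster ids than points. A sketch of why it holds, assuming graphistry's dbscan() writes sklearn-style labels into '_dbscan' (-1 for noise, 0..k-1 for k clusters):

    import pandas as pd

    labels = pd.Series([-1, 0, 0, 1, 2])  # 5 points: noise plus 3 clusters
    # the largest label is at most n_points - 1, so len(labels) > labels.max()
    assert len(labels) > labels.max()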
