dbscan clusters on story features
dcolinmorgan committed Mar 29, 2024
1 parent 88a59c9 commit 59edb63
Showing 8 changed files with 377 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/daily.yml
@@ -47,7 +47,7 @@ jobs:
       run: |
         source dots/bin/activate
         python -m spacy download en_core_web_sm
-        python -m main -d 3  # python DOTS/feat.py -d 3
+        python -m main -d 0 -n 100 -f 10
       env:
         OS_TOKEN: ${{ secrets.OS_TOKEN }}
         LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/on_push.yml
@@ -34,7 +34,7 @@ jobs:
       run: |
         source dots/bin/activate
         python -m spacy download en_core_web_sm
-        python DOTS/test/test_dots_feat.py
+        python DOTS/test/test_dots_feat.py -n 5 -f 3 -s 1 -o dots_feats.csv
      env:
        OS_TOKEN: ${{ secrets.OS_TOKEN }}
        LOBSTR_KEY: ${{ secrets.LOBSTR_KEY }}
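Note: the flags passed in these two workflows are not documented in this commit. A hypothetical argparse sketch of how main might consume them, with every flag meaning inferred only from the invocations above (all names and semantics are assumptions):

    import argparse

    parser = argparse.ArgumentParser(description='DOTS story featurization')
    parser.add_argument('-d', type=int, default=0, help='data-source selector (assumed)')
    parser.add_argument('-n', type=int, default=100, help='number of stories to pull (assumed)')
    parser.add_argument('-f', type=int, default=10, help='features to rank per story (assumed)')
    parser.add_argument('-s', type=int, default=1, help='meaning unknown; only the test run passes it (assumed)')
    parser.add_argument('-o', type=str, default='dots_feats.csv', help='output CSV path (assumed)')
    args = parser.parse_args()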
11 changes: 7 additions & 4 deletions DOTS/pull.py
@@ -152,12 +152,15 @@ def process_url(url):
 def pull_data(articles):
     # aa = [item for sublist in articles for item in sublist]
     data = [list(d['_source']['metadata'].values()) for d in articles]
-    df = pd.DataFrame(data, columns=['date', 'title', 'person', 'org', 'location', 'theme', 'text', 'url'])
-    df.date = pd.to_datetime(df.date).dt.strftime('%d-%m-%Y')
-    df['locc'] = df['location'].apply(extract_location)
+    try:  # full OpenSearch article schema
+        df = pd.DataFrame(data, columns=['date', 'title', 'person', 'org', 'location', 'theme', 'text', 'url'])
+        df.date = pd.to_datetime(df.date).dt.strftime('%d-%m-%Y')
+        df['locc'] = df['location'].apply(extract_location)
+    except Exception:  # fall back to the slimmer Google News schema
+        df = pd.DataFrame(data, columns=['title', 'id', 'url', 'title2'])
     with concurrent.futures.ThreadPoolExecutor() as executor:
         df['text'] = list(tqdm(executor.map(process_url, df['url']), total=len(df['url'])))
-    return df.values.tolist()
+    return df['text'].values.tolist()
 
 
 def pull_lobstr_gdoc(pull=1):
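Note: with the new fallback, pull_data returns only the scraped text column. A minimal smoke test for the Google News path (a sketch; the metadata keys are assumptions matching the fallback columns above):

    from DOTS.pull import pull_data

    fake_hit = {'_source': {'metadata': {
        'title': 'headline',
        'id': 'abc123',
        'url': 'https://example.com/story',
        'title2': 'headline (alt)',
    }}}
    # four metadata values cannot fill the eight-column schema, so the
    # except branch builds the ['title', 'id', 'url', 'title2'] frame
    texts = pull_data([fake_hit])
    assert isinstance(texts, list) and len(texts) == 1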
38 changes: 38 additions & 0 deletions DOTS/scrape.py
@@ -29,6 +29,44 @@ def get_OS_data(n):
     data = json.loads(output)
     return data
 
+def get_gnews_data(n):
+    # -d added so curl actually sends the query body; without it the JSON
+    # string is parsed as a second URL and the body is never transmitted
+    bash_command = f"""
+    curl -X GET "{os_url}/test-google-news-index/_search" -H 'Content-Type: application/json' -d '{{
+    "_source": ["metadata.link", "metadata.title"],
+    "size": {n},
+    "query": {{
+        "bool": {{
+            "must": [
+                {{"match_all": {{}}}}
+            ]
+        }}
+    }}
+    }}'
+    """
+    process = subprocess.run(bash_command, shell=True, capture_output=True, text=True)
+    output = process.stdout
+    data = json.loads(output)
+    return data
+
+def get_test_gnews(n):
+    # same query as get_gnews_data but without the Content-Type header
+    bash_command = f"""
+    curl -X GET "{os_url}/test-google-news-index/_search" -d '{{
+    "_source": ["metadata.link", "metadata.title"],
+    "size": {n},
+    "query": {{
+        "bool": {{
+            "must": [
+                {{"match_all": {{}}}}
+            ]
+        }}
+    }}
+    }}'
+    """
+    process = subprocess.run(bash_command, shell=True, capture_output=True, text=True)
+    output = process.stdout
+    data = json.loads(output)
+    return data
+
+
 def get_massive_OS_data(t=1):
     client = OpenSearch(os_url)
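Note: downstream code consumes these responses as OpenSearch hit lists; a sketch pieced together from pull.py and the tests (metadata field names beyond 'link' and 'title' are assumptions):

    from DOTS.scrape import get_gnews_data

    data = get_gnews_data(10)
    for hit in data['hits']['hits']:
        meta = hit['_source']['metadata']
        print(meta.get('title'), meta.get('link'))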
34 changes: 33 additions & 1 deletion DOTS/test/test_dots_feat.py
@@ -3,7 +3,7 @@
 from DOTS.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news
 from DOTS.pull import process_hit, process_data, pull_data
 from datetime import datetime
-
+import pandas as pd
 
 @pytest.fixture
 def get_data():
@@ -90,3 +90,35 @@ def test_lobstr_featurize(get_lobstr_data):
         assert len(features) == 4
     except:
         pass
+
+# needed by the new tests (module paths assumed; these would normally sit at the top of the file)
+from DOTS.scrape import get_gnews_data
+from DOTS.feat import featurize_stories
+import graphistry
+
+
+@pytest.fixture
+def get_gdata():
+    return get_gnews_data(10)
+
+
+def test_get_test_gnews_data(get_gdata):
+    data = get_gdata
+    assert len(data['hits']['hits']) == 10
+
+
+def test_gnews_test(get_gdata):
+    rank_articles = []
+    data = get_gdata
+    hits = data["hits"]["hits"]  # was: response["hits"]["hits"], a NameError
+    articles = pull_data(hits)
+    try:  # some stories will be unretrievable
+        cc = featurize_stories(str(articles), 4, 512)
+        assert len(cc) == 4
+        rank_articles.append(cc)
+    except Exception:
+        pass
+    flattened_list = [item for sublist in rank_articles for item in sublist]
+    data = pd.DataFrame(flattened_list)  # each ranked feature is a row
+    data.drop_duplicates(inplace=True)
+
+    object_columns = data.select_dtypes(include=['object']).columns
+    data[object_columns] = data[object_columns].astype(str)
+
+    g = graphistry.nodes(data)
+    g2 = g.umap()
+    g3 = g2.dbscan()
+    g3.encode_point_color('_dbscan', palette=["hotpink", "dodgerblue"], as_continuous=True).plot()
+
+    assert len(g3._nodes) > max(g3._nodes._dbscan)
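Note: the final assertion is a sanity check that clustering produced fewer cluster ids than points. A sketch of why it holds, assuming graphistry's dbscan() writes sklearn-style labels into '_dbscan' (-1 for noise, 0..k-1 for k clusters):

    import pandas as pd

    labels = pd.Series([-1, 0, 0, 1, 2])  # 5 points: noise plus 3 clusters
    # the largest label is at most n_points - 1, so len(labels) > labels.max()
    assert len(labels) > labels.max()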
