Google BigQuery is a web service that lets you do interactive analysis of very massive datasets - analyzing billions of rows in seconds.
ref:
https://www.githubarchive.org/#bigquery
https://bigquery.cloud.google.com/table/githubarchive:month.201612
See also:
http://ghtorrent.org/
Show repository informations (1)
WITH repo_info AS (
SELECT repo.id AS id, repo.name AS name, JSON_EXTRACT_SCALAR(payload, '$.pull_request.base.repo.description') AS description
FROM `githubarchive.month.2017*`
-- FROM `githubarchive.year.2016`
-- FROM `githubarchive.year.*`
WHERE type = "PullRequestEvent"
)
SELECT repo_info.name, ANY_VALUE(repo_info.description) AS description
FROM repo_info
WHERE
repo_info.description IS NOT NULL AND
repo_info.description != ""
GROUP BY repo_info.name
ORDER BY repo_info.name
ref:
https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#json-functions
https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#any_value
Show repository informations (2)
WITH repo_info AS (
SELECT repo.id AS id, repo.name AS name, JSON_EXTRACT_SCALAR(payload, '$.description') AS description
FROM `githubarchive.month.201708`
WHERE type = "CreateEvent"
)
SELECT repo_info.name, ANY_VALUE(repo_info.description) AS description
FROM repo_info
WHERE
repo_info.description IS NOT NULL AND
repo_info.description != ""
GROUP BY repo_info.name
ORDER BY repo_info.name
Show repository informations (3)
SELECT name, description
FROM `ghtorrent-bq.ght_2017_04_01.projects`
WHERE
forked_from IS NULL AND
description IS NOT NULL AND
description != ""
Show starred repositories by a specific user
You must use WatchEvent
for starring a repo:
https://developer.github.com/v3/activity/events/types/#watchevent
SELECT repo.name, created_at
FROM TABLE_QUERY([githubarchive:month], 'LEFT(table_ID,4) IN ("2017","2016","2015")')
WHERE type = "WatchEvent" AND actor.login = 'vinta'
GROUP BY repo.name, created_at
ORDER BY created_at DESC
Show starred repositories per user who has 10+ starred repositories
WITH stars AS (
SELECT DISTINCT actor.login AS user, repo.name AS repo
FROM `githubarchive.month.2017*`
WHERE type="WatchEvent"
),
repositories_stars AS (
SELECT repo, COUNT(*) as c FROM stars GROUP BY repo
ORDER BY c DESC
LIMIT 1000
),
users_stars AS (
SELECT user, COUNT(*) as c FROM stars
WHERE repo IN (SELECT repo FROM repositories_stars)
GROUP BY user
HAVING c >= 10
LIMIT 10000
)
SELECT user, repo FROM stars
WHERE repo IN (SELECT repo FROM repositories_stars)
AND user IN (SELECT user FROM users_stars)
ref:
https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e