-
-
Save danmaas/a05b38a01163899ca89fe9eef806d68e to your computer and use it in GitHub Desktop.
Scrapes Docker Hub, collecting the pull count of every image, the time each image's "latest" tag was last updated, and the "FROM" base image of each container's Dockerfile.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#########################################################################################
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
#########################################################################################
# Turn on the strictest checking so uninitialised variables and missing
# properties surface as errors.
Microsoft.PowerShell.Core\Set-StrictMode -Version Latest

#region variables
# Product search endpoint of the Docker store API.
$urlDockerfile = 'https://store.docker.com/api/content/v1/products/search'
# Base address for per-repository lookups (tags, dockerfile).
$urlTags       = 'https://store.docker.com/v2/repositories/'
# Full path of the current directory; the CSV report is written here.
$dockerFileDir = (Get-Item -Path '.\' -Verbose).FullName
# Accumulates one record per scraped image.
$dockerFiles   = @()
#endregion variables
Write-Output "Scraping Dockerhub via Docker store..."

# First we gather all the images from the certified docker store area.
# Reads  : $urlDockerfile, $dockerFileDir, $dockerFiles
# Writes : appends one record per image to $dockerFiles, then exports the
#          whole array to "<current dir>\out.csv".

# For each hit to the search API we can fetch 100 items.
$itemsPerPage = 100
[string]$QueryString = "page_size=$itemsPerPage&q=&source=verified&type=image%2Cbundle"
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri

# This first request is only used to read the total number of matching products.
# -UseBasicParsing avoids depending on the Internet Explorer engine on Windows PowerShell.
$HttpContent = Invoke-WebRequest -Uri $BaseUri -UseBasicParsing -ErrorAction Stop
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# This will be the total number of dockerfiles in the dockerhub store area.
$numDockerfiles = $PageResponseContent.count

# Round UP so the final, partially filled page is fetched too. A plain [int]
# cast rounds to nearest, which drops the last page (e.g. 120 items -> 1 page)
# and yields zero pages when fewer than half a page of items exist.
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)

for ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    # Get the page results from the search API.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$($PageNumber.ToString())"
    $pageResponse = Invoke-WebRequest -Uri $PageUri -UseBasicParsing -ErrorAction Stop
    $pageContent = $pageResponse.Content | ConvertFrom-Json

    foreach ($storeItem in $pageContent.summaries) {
        # Fetch the product detail document for this search hit.
        $productUri = "https://store.docker.com/api/content/v1/products/images/" + $storeItem.slug
        $productResponse = Invoke-WebRequest -Uri $productUri -UseBasicParsing -ErrorAction Stop
        $product = $productResponse.Content | ConvertFrom-Json

        # Take the first repository of the first plan. Reset $repo on every
        # iteration and probe the properties explicitly: under strict mode a
        # missing/empty 'plans' or 'repositories' would otherwise raise an error
        # and leave $repo holding the previous product's repository.
        $repo = $null
        $plansProp = $product.PSObject.Properties['plans']
        if ($plansProp -and $plansProp.Value) {
            $reposProp = @($plansProp.Value)[0].PSObject.Properties['repositories']
            if ($reposProp -and $reposProp.Value) {
                $repo = @($reposProp.Value)[0]
            }
        }
        if ($null -eq $repo) {
            Write-Warning "Skipping '$($storeItem.slug)': no plan/repository information found."
            continue
        }

        # This grabs all container images in the store which also reside in dockerhub.
        if ($repo.namespace -eq "store") {
            continue
        }

        $repoName = $repo.namespace + "/" + $repo.reponame
        $tagsUri = "https://hub.docker.com/v2/repositories/" + $repoName + "/tags/?page=1&page_size=250"
        $tagsResponse = Invoke-WebRequest -Uri $tagsUri -UseBasicParsing -ErrorAction Stop
        $tags = ($tagsResponse.Content | ConvertFrom-Json).results

        # Get the date the 'latest' tag was updated; stays empty when there is no such tag.
        $latestDate = ""
        foreach ($tag in $tags) {
            if ($tag.name -eq "latest") {
                $latestDate = $tag.last_updated
                break
            }
        }

        # The base image is not looked up for store images, so FromImage is left empty.
        $fromBaseImage = ""

        # Save the result. The property order defines the CSV column order.
        $dockerFiles += [PSCustomObject]@{
            Name      = $repoName
            PullCount = $storeItem.popularity
            CreatedAt = $storeItem.created_at
            UpdatedAt = $latestDate
            FromImage = $fromBaseImage
        }
    }
}

Write-Output "Finished processing 'store' images"

# Write the outputted array of dockerfiles to file.
$fileDestination = $dockerFileDir + "\out.csv"
$dockerFiles | Export-Csv $fileDestination -NoTypeInformation
# Now we're going to gather all of the images from "community".
# Reads  : $urlDockerfile, $urlTags, $dockerFileDir, $dockerFiles
# Writes : appends one record per image to $dockerFiles and re-exports the
#          whole array to "<current dir>\out.csv" after every page.

# Downloads $Uri and returns the parsed JSON document, or $null when the
# request or the JSON parsing fails (a warning is written in that case).
function Get-ScrapeJsonOrNull {
    param([Parameter(Mandatory = $true)][string]$Uri)

    try {
        # -UseBasicParsing avoids depending on the Internet Explorer engine on Windows PowerShell.
        $response = Invoke-WebRequest -Uri $Uri -UseBasicParsing -ErrorAction Stop
        return ($response.Content | ConvertFrom-Json)
    }
    catch {
        Write-Warning "Request to '$Uri' failed: $($_.Exception.Message)"
        return $null
    }
}

# Returns the 'last_updated' value of the repository's "latest" tag, or ""
# when the tag list cannot be fetched or contains no "latest" entry.
function Get-LatestTagDate {
    param([string]$TagsBaseUrl, [string]$RepoName)

    $payload = Get-ScrapeJsonOrNull -Uri ($TagsBaseUrl + $RepoName + "/tags/?page_size=100&page=1")
    if ($null -eq $payload) { return "" }

    # Probe for the property explicitly: under strict mode, reading a missing
    # property is an error.
    $resultsProp = $payload.PSObject.Properties['results']
    if ($null -eq $resultsProp -or $null -eq $resultsProp.Value) { return "" }

    foreach ($tag in @($resultsProp.Value)) {
        if ($null -ne $tag -and $tag.name -eq "latest") {
            return $tag.last_updated
        }
    }
    return ""
}

# Returns the text following the first FROM instruction of the repository's
# dockerfile, or "" when the dockerfile cannot be fetched or has no FROM line.
function Get-DockerfileBaseImage {
    param([string]$TagsBaseUrl, [string]$RepoName)

    $payload = Get-ScrapeJsonOrNull -Uri ($TagsBaseUrl + $RepoName + "/dockerfile/")
    if ($null -eq $payload) { return "" }

    $contentsProp = $payload.PSObject.Properties['contents']
    if ($null -eq $contentsProp -or $null -eq $contentsProp.Value) { return "" }

    foreach ($line in ([string]$contentsProp.Value -split '\r?\n')) {
        # Match the FROM keyword as a prefix (case-insensitively, allowing
        # leading whitespace) and capture the rest of the line, trimmed.
        # String.TrimStart("FROM") would instead strip any leading run of the
        # characters F, R, O and M.
        if ($line -match '^\s*FROM\s+(.+?)\s*$') {
            return $Matches[1]
        }
    }
    return ""
}

# For each hit to the search API we can fetch 100 items.
$itemsPerPage = 100

# Build the query string to capture total number of pages to click through.
[string]$QueryString = "page_size=$itemsPerPage&q=%2B&source=community&type=image%2Cbundle"
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri

# This first request is only used to read the total number of matching images.
$HttpContent = Invoke-WebRequest -Uri $BaseUri -UseBasicParsing -ErrorAction Stop
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# This will be the total number of dockerfiles in dockerhub.
$numDockerfiles = $PageResponseContent.count

# Round UP so the final, partially filled page is fetched too. A plain [int]
# cast rounds to nearest and can drop the last page.
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)

$fileDestination = $dockerFileDir + "\out.csv"

for ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    Write-Output "Processing page $PageNumber"

    # Get the page results from the search API.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$($PageNumber.ToString())"
    $pageResponse = Invoke-WebRequest -Uri $PageUri -UseBasicParsing -ErrorAction Stop
    $pageContent = $pageResponse.Content | ConvertFrom-Json

    foreach ($communityItem in $pageContent.summaries) {
        # Each lookup returns "" on failure, so a failed request can never
        # leak the previous image's values into this record.
        $latestDate = Get-LatestTagDate -TagsBaseUrl $urlTags -RepoName $communityItem.name
        $fromBaseImage = Get-DockerfileBaseImage -TagsBaseUrl $urlTags -RepoName $communityItem.name

        # Make a new record representing the dockerfile with the values we want.
        # The property order defines the CSV column order.
        $dockerFiles += [PSCustomObject]@{
            Name      = $communityItem.name
            PullCount = $communityItem.popularity
            CreatedAt = $communityItem.created_at
            UpdatedAt = $latestDate
            FromImage = $fromBaseImage
        }
    }

    # Write the accumulated records to file after every page, so the results
    # gathered so far are on disk even if a later page aborts the run.
    $dockerFiles | Export-Csv $fileDestination -NoTypeInformation
}

Write-Output "Scraping Complete!"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment