Use CodeQL CLI to parse languages

This commit is contained in:
Henry Mercer
2025-05-30 16:32:43 +01:00
parent fa0b6fff20
commit e303175b83
21 changed files with 371 additions and 294 deletions

View File

@@ -1370,3 +1370,21 @@ async function getJobRunUuidSarifOptions(codeql: CodeQL) {
? [`--sarif-run-property=jobRunUuid=${jobRunUuid}`]
: [];
}
/**
 * Builds a lookup from every language name reported by
 * `codeql.betterResolveLanguages()` to the name of the extractor that handles it.
 *
 * Each extractor name maps to itself, and each alias (when the result includes
 * an `aliases` table) maps to the extractor it refers to.
 *
 * The returned object is created without a prototype. Callers index it with
 * arbitrary, user-supplied language names, so a name such as "constructor" or
 * "toString" must yield `undefined` rather than a member inherited from
 * `Object.prototype`.
 *
 * @param codeql The CodeQL instance used to resolve the available languages.
 * @returns A map from language name or alias to extractor name.
 */
export async function getSupportedLanguageMap(
  codeql: CodeQL,
): Promise<Record<string, string>> {
  const resolveResult = await codeql.betterResolveLanguages();
  // No prototype: only the names added below can ever be found by a lookup.
  const supportedLanguages = Object.create(null) as Record<string, string>;

  // Populate canonical language names
  for (const extractor of Object.keys(resolveResult.extractors)) {
    supportedLanguages[extractor] = extractor;
  }

  // Populate language aliases; the `aliases` table may be absent.
  if (resolveResult.aliases) {
    for (const [alias, extractor] of Object.entries(resolveResult.aliases)) {
      supportedLanguages[alias] = extractor;
    }
  }

  return supportedLanguages;
}

View File

@@ -122,6 +122,14 @@ test("load empty config", async (t) => {
const languages = "javascript,python";
const codeql = setCodeQL({
async betterResolveLanguages() {
return {
extractors: {
javascript: [{ extractor_root: "" }],
python: [{ extractor_root: "" }],
},
};
},
async resolveQueries() {
return {
byLanguage: {
@@ -166,6 +174,14 @@ test("loading config saves config", async (t) => {
const logger = getRunnerLogger(true);
const codeql = setCodeQL({
async betterResolveLanguages() {
return {
extractors: {
javascript: [{ extractor_root: "" }],
python: [{ extractor_root: "" }],
},
};
},
async resolveQueries() {
return {
byLanguage: {
@@ -297,6 +313,13 @@ test("load non-existent input", async (t) => {
test("load non-empty input", async (t) => {
return await withTmpDir(async (tempDir) => {
const codeql = setCodeQL({
async betterResolveLanguages() {
return {
extractors: {
javascript: [{ extractor_root: "" }],
},
};
},
async resolveQueries() {
return {
byLanguage: {
@@ -421,6 +444,14 @@ test("Using config input and file together, config input should be used.", async
extraSearchPath: string | undefined;
}> = [];
const codeql = setCodeQL({
async betterResolveLanguages() {
return {
extractors: {
javascript: [{ extractor_root: "" }],
python: [{ extractor_root: "" }],
},
};
},
async resolveQueries(
queries: string[],
extraSearchPath: string | undefined,
@@ -454,6 +485,13 @@ test("Using config input and file together, config input should be used.", async
test("API client used when reading remote config", async (t) => {
return await withTmpDir(async (tempDir) => {
const codeql = setCodeQL({
async betterResolveLanguages() {
return {
extractors: {
javascript: [{ extractor_root: "" }],
},
};
},
async resolveQueries() {
return {
byLanguage: {
@@ -1006,7 +1044,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
[
{
name: "languages from input",
codeqlResolvedLanguages: ["javascript", "java", "python"],
languagesInput: "jAvAscript, \n jaVa",
languagesInRepository: ["SwiFt", "other"],
expectedLanguages: ["javascript", "java"],
@@ -1014,7 +1051,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "languages from github api",
codeqlResolvedLanguages: ["javascript", "java", "python"],
languagesInput: "",
languagesInRepository: [" jAvAscript\n \t", " jaVa", "SwiFt", "other"],
expectedLanguages: ["javascript", "java"],
@@ -1022,7 +1058,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "aliases from input",
codeqlResolvedLanguages: ["javascript", "csharp", "cpp", "java", "python"],
languagesInput: " typEscript\n \t, C#, c , KoTlin",
languagesInRepository: ["SwiFt", "other"],
expectedLanguages: ["javascript", "csharp", "cpp", "java"],
@@ -1030,7 +1065,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "duplicate languages from input",
codeqlResolvedLanguages: ["javascript", "java", "python"],
languagesInput: "jAvAscript, \n jaVa, kotlin, typescript",
languagesInRepository: ["SwiFt", "other"],
expectedLanguages: ["javascript", "java"],
@@ -1038,7 +1072,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "aliases from github api",
codeqlResolvedLanguages: ["javascript", "csharp", "cpp", "java", "python"],
languagesInput: "",
languagesInRepository: [" typEscript\n \t", " C#", "c", "other"],
expectedLanguages: ["javascript", "csharp", "cpp"],
@@ -1046,7 +1079,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "no languages",
codeqlResolvedLanguages: ["javascript", "java", "python"],
languagesInput: "",
languagesInRepository: [],
expectedApiCall: true,
@@ -1054,7 +1086,6 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
},
{
name: "unrecognized languages from input",
codeqlResolvedLanguages: ["javascript", "java", "python"],
languagesInput: "a, b, c, javascript",
languagesInRepository: [],
expectedApiCall: false,
@@ -1063,15 +1094,26 @@ const mockRepositoryNwo = parseRepositoryNwo("owner/repo");
].forEach((args) => {
test(`getLanguages: ${args.name}`, async (t) => {
const mockRequest = mockLanguagesInRepo(args.languagesInRepository);
const languages = args.codeqlResolvedLanguages.reduce(
(acc, lang) => ({
...acc,
[lang]: true,
}),
{},
);
const stubExtractorEntry = {
extractor_root: "",
};
const codeQL = setCodeQL({
resolveLanguages: () => Promise.resolve(languages),
betterResolveLanguages: () =>
Promise.resolve({
aliases: {
"c#": Language.csharp,
c: Language.cpp,
kotlin: Language.java,
typescript: Language.javascript,
},
extractors: {
cpp: [stubExtractorEntry],
csharp: [stubExtractorEntry],
java: [stubExtractorEntry],
javascript: [stubExtractorEntry],
python: [stubExtractorEntry],
},
}),
});
if (args.expectedLanguages) {

View File

@@ -7,10 +7,10 @@ import * as semver from "semver";
import * as api from "./api-client";
import { CachingKind, getCachingKind } from "./caching-utils";
import { CodeQL } from "./codeql";
import { CodeQL, getSupportedLanguageMap } from "./codeql";
import { shouldPerformDiffInformedAnalysis } from "./diff-informed-analysis-utils";
import { Feature, FeatureEnablement } from "./feature-flags";
import { Language, parseLanguage } from "./languages";
import { Language } from "./languages";
import { Logger } from "./logging";
import { RepositoryNwo } from "./repository";
import { downloadTrapCaches } from "./trap-caching";
@@ -271,13 +271,12 @@ export function getUnknownLanguagesError(languages: string[]): string {
}
/**
* Gets the set of languages in the current repository that are
* scannable by CodeQL.
* Gets the set of languages in the current repository.
*/
export async function getLanguagesInRepo(
export async function getRawLanguagesInRepo(
repository: RepositoryNwo,
logger: Logger,
): Promise<Language[]> {
): Promise<string[]> {
logger.debug(`GitHub repo ${repository.owner} ${repository.repo}`);
const response = await api.getApiClient().rest.repos.listLanguages({
owner: repository.owner,
@@ -285,19 +284,9 @@ export async function getLanguagesInRepo(
});
logger.debug(`Languages API response: ${JSON.stringify(response)}`);
// The GitHub API is going to return languages in order of popularity,
// When we pick a language to autobuild we want to pick the most popular traced language
// Since sets in javascript maintain insertion order, using a set here and then splatting it
// into an array gives us an array of languages ordered by popularity
const languages: Set<Language> = new Set();
for (const lang of Object.keys(response.data as Record<string, number>)) {
const parsedLang = parseLanguage(lang);
if (parsedLang !== undefined) {
languages.add(parsedLang);
}
}
return [...languages];
return Object.keys(response.data as Record<string, number>).map((language) =>
language.trim().toLowerCase(),
);
}
/**
@@ -311,7 +300,7 @@ export async function getLanguagesInRepo(
* then throw an error.
*/
export async function getLanguages(
codeQL: CodeQL,
codeql: CodeQL,
languagesInput: string | undefined,
repository: RepositoryNwo,
logger: Logger,
@@ -323,23 +312,24 @@ export async function getLanguages(
logger,
);
let languages = rawLanguages;
if (autodetected) {
const supportedLanguages = Object.keys(await codeQL.resolveLanguages());
const languageMap = await getSupportedLanguageMap(codeql);
const languagesSet = new Set<string>();
const unknownLanguages: string[] = [];
languages = languages
.map(parseLanguage)
.filter((value) => value && supportedLanguages.includes(value))
.map((value) => value as Language);
logger.info(`Automatically detected languages: ${languages.join(", ")}`);
} else {
const aliases = (await codeQL.betterResolveLanguages()).aliases;
if (aliases) {
languages = languages.map((lang) => aliases[lang] || lang);
// Make sure they are supported
for (const language of rawLanguages) {
const extractorName = languageMap[language];
if (extractorName === undefined) {
unknownLanguages.push(language);
} else {
languagesSet.add(extractorName);
}
}
logger.info(`Languages from configuration: ${languages.join(", ")}`);
const languages = Array.from(languagesSet);
if (!autodetected && unknownLanguages.length > 0) {
throw new ConfigurationError(getUnknownLanguagesError(unknownLanguages));
}
// If the languages parameter was not given and no languages were
@@ -348,25 +338,14 @@ export async function getLanguages(
throw new ConfigurationError(getNoLanguagesError());
}
// Make sure they are supported
const parsedLanguages: Language[] = [];
const unknownLanguages: string[] = [];
for (const language of languages) {
const parsedLanguage = parseLanguage(language) as Language;
if (parsedLanguage === undefined) {
unknownLanguages.push(language);
} else if (!parsedLanguages.includes(parsedLanguage)) {
parsedLanguages.push(parsedLanguage);
}
if (autodetected) {
logger.info(`Autodetected languages: ${languages.join(", ")}`);
} else {
logger.info(`Languages from configuration: ${languages.join(", ")}`);
}
// Any unknown languages here would have come directly from the input
// since we filter unknown languages coming from the GitHub API.
if (unknownLanguages.length > 0) {
throw new ConfigurationError(getUnknownLanguagesError(unknownLanguages));
}
return parsedLanguages;
// TODO: use a type alias for Language and rename Language to KnownLanguage
return languages as Language[];
}
/**
@@ -383,22 +362,24 @@ export async function getRawLanguages(
languagesInput: string | undefined,
repository: RepositoryNwo,
logger: Logger,
) {
): Promise<{
rawLanguages: string[];
autodetected: boolean;
}> {
// Obtain from action input 'languages' if set
let rawLanguages = (languagesInput || "")
const languagesFromInput = (languagesInput || "")
.split(",")
.map((x) => x.trim().toLowerCase())
.filter((x) => x.length > 0);
let autodetected: boolean;
if (rawLanguages.length) {
autodetected = false;
} else {
autodetected = true;
// Obtain all languages in the repo that can be analysed
rawLanguages = (await getLanguagesInRepo(repository, logger)) as string[];
// If the user has specified languages, use those.
if (languagesFromInput.length) {
return { rawLanguages: languagesFromInput, autodetected: false };
}
return { rawLanguages, autodetected };
// Otherwise, autodetect languages in the repository.
return {
rawLanguages: await getRawLanguagesInRepo(repository, logger),
autodetected: true,
};
}
/** Inputs required to initialize a configuration. */

View File

@@ -1,33 +0,0 @@
import test from "ava";
import { Language, parseLanguage } from "./languages";
import { setupTests } from "./testing-utils";
setupTests(test);
// Table-driven check of `parseLanguage`: each row pairs a raw input with the
// value the parser is expected to return for it.
test("parseLanguage", async (t) => {
  const expectations: Array<[string, Language | undefined]> = [
    // Exact matches
    ["csharp", Language.csharp],
    ["cpp", Language.cpp],
    ["go", Language.go],
    ["java", Language.java],
    ["javascript", Language.javascript],
    ["python", Language.python],
    ["rust", Language.rust],
    // Aliases
    ["c", Language.cpp],
    ["c++", Language.cpp],
    ["c#", Language.csharp],
    ["kotlin", Language.java],
    ["typescript", Language.javascript],
    // spaces and case-insensitivity
    [" \t\nCsHaRp\t\t", Language.csharp],
    [" \t\nkOtLin\t\t", Language.java],
    // Not matches
    ["foo", undefined],
    [" ", undefined],
    ["", undefined],
  ];

  for (const [rawName, expected] of expectations) {
    t.deepEqual(parseLanguage(rawName), expected);
  }
});

View File

@@ -15,41 +15,3 @@ export enum Language {
rust = "rust",
swift = "swift",
}
/**
 * Additional names for languages: each key is an alias and each value is the
 * `Language` that the alias refers to.
 *
 * Keys are written in lower case; `parseLanguage` trims and lower-cases its
 * input before looking it up in this table.
 */
export const LANGUAGE_ALIASES: { [lang: string]: Language } = {
c: Language.cpp,
"c++": Language.cpp,
"c#": Language.csharp,
kotlin: Language.java,
typescript: Language.javascript,
"javascript-typescript": Language.javascript,
"java-kotlin": Language.java,
};
/**
 * Translate from user input or GitHub's API names for languages to CodeQL's
 * names for languages.
 *
 * The input is trimmed and lower-cased, then matched first against the
 * `Language` enum and then against `LANGUAGE_ALIASES`. Only own properties are
 * consulted, so names inherited from `Object.prototype` (for example
 * "constructor" or "toString") are not mistaken for languages.
 *
 * @param language The language to translate.
 * @returns The matching `Language` (aliases are resolved to the language they
 * refer to), or `undefined` if the input is neither a language nor a known
 * alias.
 */
export function parseLanguage(language: string): Language | undefined {
  // Normalise to lower case
  const normalized = language.trim().toLowerCase();

  // See if it's an exact match
  if (Object.prototype.hasOwnProperty.call(Language, normalized)) {
    return normalized as Language;
  }

  // Otherwise resolve a known alias to the language it refers to.
  if (Object.prototype.hasOwnProperty.call(LANGUAGE_ALIASES, normalized)) {
    return LANGUAGE_ALIASES[normalized];
  }

  return undefined;
}

View File

@@ -1,7 +1,9 @@
import test from "ava";
import { Language } from "./languages";
import { getRunnerLogger } from "./logging";
import * as startProxyExports from "./start-proxy";
import { parseLanguage } from "./start-proxy";
import { setupTests } from "./testing-utils";
setupTests(test);
@@ -113,3 +115,30 @@ test("getCredentials throws an error when non-printable characters are used", as
);
}
});
// Table-driven check of `parseLanguage`: each row pairs a raw input with the
// value the parser is expected to return for it.
test("parseLanguage", async (t) => {
  const expectations: Array<[string, Language | undefined]> = [
    // Exact matches
    ["csharp", Language.csharp],
    ["cpp", Language.cpp],
    ["go", Language.go],
    ["java", Language.java],
    ["javascript", Language.javascript],
    ["python", Language.python],
    ["rust", Language.rust],
    // Aliases
    ["c", Language.cpp],
    ["c++", Language.cpp],
    ["c#", Language.csharp],
    ["kotlin", Language.java],
    ["typescript", Language.javascript],
    // spaces and case-insensitivity
    [" \t\nCsHaRp\t\t", Language.csharp],
    [" \t\nkOtLin\t\t", Language.java],
    // Not matches
    ["foo", undefined],
    [" ", undefined],
    ["", undefined],
  ];

  for (const [rawName, expected] of expectations) {
    t.deepEqual(parseLanguage(rawName), expected);
  }
});

View File

@@ -1,4 +1,4 @@
import { parseLanguage, Language } from "./languages";
import { Language } from "./languages";
import { Logger } from "./logging";
import { ConfigurationError } from "./util";
@@ -11,6 +11,49 @@ export type Credential = {
token?: string;
};
/**
 * Language aliases supported by the start-proxy Action.
 *
 * Each key is an alias and each value is the `Language` that the alias refers
 * to. Keys are written in lower case; `parseLanguage` trims and lower-cases
 * its input before looking it up in this table.
 *
 * In general, the CodeQL CLI is the source of truth for language aliases, and to
 * allow us to more easily support new languages, we want to avoid hardcoding these
 * aliases in the Action itself. However this is difficult to do in the start-proxy
 * Action since this Action does not use CodeQL, so we're accepting some hardcoding
 * for this Action.
 */
const LANGUAGE_ALIASES: { [lang: string]: Language } = {
c: Language.cpp,
"c++": Language.cpp,
"c#": Language.csharp,
kotlin: Language.java,
typescript: Language.javascript,
"javascript-typescript": Language.javascript,
"java-kotlin": Language.java,
};
/**
 * Parse the start-proxy language input into its canonical CodeQL language name.
 *
 * The input is trimmed and lower-cased, then matched first against the
 * `Language` enum and then against `LANGUAGE_ALIASES`. Only own properties are
 * consulted, so names inherited from `Object.prototype` (for example
 * "constructor" or "toString") are not mistaken for languages.
 *
 * Exported for testing, do not use this outside of the start-proxy Action
 * (see the `LANGUAGE_ALIASES` docstring for more info).
 *
 * @param language The raw language name to parse.
 * @returns The matching `Language` (aliases are resolved to the language they
 * refer to), or `undefined` if the input is neither a language nor a known
 * alias.
 */
export function parseLanguage(language: string): Language | undefined {
  // Normalize to lower case
  const normalized = language.trim().toLowerCase();

  // See if it's an exact match
  if (Object.prototype.hasOwnProperty.call(Language, normalized)) {
    return normalized as Language;
  }

  // Otherwise resolve a known alias to the language it refers to.
  if (Object.prototype.hasOwnProperty.call(LANGUAGE_ALIASES, normalized)) {
    return LANGUAGE_ALIASES[normalized];
  }

  return undefined;
}
const LANGUAGE_TO_REGISTRY_TYPE: Partial<Record<Language, string>> = {
java: "maven_repository",
csharp: "nuget_feed",