呼和浩特-哈素海-辉腾锡勒草原

河北省怀来县鸡鸣驿城

用 BigQuery 查询 GitHub 上的 'Repositories contributed to'

在 Google BigQuery 上查,不考虑免费额度,大约每次 7 美元

WITH user_name AS (SELECT 'crvv' UNION ALL SELECT 'CRVV'),
data AS (
SELECT
  type,
  repo.name AS repo_name,
  regexp_extract(repo.name, '(.*)/.*') AS repo_owner,
  coalesce(
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.html_url'),
    JSON_EXTRACT_SCALAR(payload, '$.issue.html_url')
  ) AS url,
  coalesce(
    JSON_EXTRACT_SCALAR(payload, '$.pull_request.created_at'),
    JSON_EXTRACT_SCALAR(payload, '$.issue.created_at')
  ) AS time
FROM
  `githubarchive.month.201*`
WHERE
  _TABLE_SUFFIX > '50'
  AND type IN ('IssuesEvent', 'PullRequestEvent')
  AND actor.login IN (SELECT * FROM user_name)
  AND JSON_EXTRACT_SCALAR(payload, '$.action') = 'opened'
)
SELECT * FROM data
  WHERE repo_owner NOT IN (SELECT * FROM user_name)
  ORDER BY time

北京市昌平区菩萨鹿镇

用 Leetcode #185 的几种解法对比 PostgreSQL 和 MySQL

#185 是 Leetcode 上数据库部分通过率最低的题目

CREATE TABLE

MySQL

CREATE TABLE Department (
  Id   INT AUTO_INCREMENT PRIMARY KEY,
  Name TEXT
);

CREATE TABLE Employee (
  Id           INT AUTO_INCREMENT PRIMARY KEY,
  Name         TEXT,
  Salary       INT,
  DepartmentId INT
);

PostgreSQL

CREATE TABLE "Department" (
  "Id"   SERIAL PRIMARY KEY,
  "Name" TEXT
);

CREATE TABLE "Employee" (
  "Id"           SERIAL PRIMARY KEY,
  "Name"         TEXT,
  "Salary"       INT,
  "DepartmentId" INT
);

INSERT test data

MySQL

CREATE PROCEDURE insert_department(c INT)
  BEGIN
    DECLARE i INT DEFAULT 0;
    label1: LOOP
      IF i = c THEN
        LEAVE label1;
      END IF;
      INSERT INTO Department (Name) SELECT CAST(round(rand() * 100000000) AS CHAR(8));
      SET i = i + 1;
      ITERATE label1;
    END LOOP label1;
  END;

CREATE PROCEDURE insert_employee(c INT)
  BEGIN
    DECLARE i INT DEFAULT 0;
    label1: LOOP
      IF i = c THEN
        LEAVE label1;
      END IF;
      INSERT INTO Employee (Name, Salary, DepartmentId)
        SELECT CAST(round(rand() * 100000000) AS CHAR(8)), 100000 * rand(), 10 * rand();
      SET i = i + 1;
      ITERATE label1;
    END LOOP label1;
  END;

CALL insert_department(10);
CALL insert_employee(10000);

PostgreSQL

DO $$
DECLARE
  i INT = 0;
BEGIN
  FOR i IN 1..10 LOOP
    INSERT INTO "Department" ("Name") SELECT round(random() * 100000000)::CHAR(8);
  END LOOP;
END
$$;

DO $$
DECLARE
  i INT = 0;
BEGIN
  FOR i IN 1..10000 LOOP
    INSERT INTO "Employee" ("Name", "Salary", "DepartmentId")
      SELECT CAST(round(random() * 100000000) AS CHAR(8)), 100000 * random(), 10 * random();
  END LOOP;
END
$$;

Test

从网上抄来的解法 1

MySQL

SELECT Department.Name AS Department, e1.Name AS Employee, e1.Salary
FROM Employee e1
  INNER JOIN Department ON e1.DepartmentId = Department.Id
WHERE (SELECT count(DISTINCT Employee.Salary)
       FROM Employee
       WHERE e1.DepartmentId = Employee.DepartmentId
      AND e1.Salary < Employee.Salary) < 3

30 rows retrieved starting from 1 in 14s 6ms (execution: 14s 5ms, fetching: 1ms)

PostgreSQL

SELECT "Department"."Name" AS "Department", e1."Name" AS "Employee", e1."Salary"
FROM "Employee" e1
  INNER JOIN "Department" ON e1."DepartmentId" = "Department"."Id"
WHERE (SELECT count(DISTINCT "Employee"."Salary")
       FROM "Employee"
       WHERE e1."DepartmentId" = "Employee"."DepartmentId"
             AND e1."Salary" < "Employee"."Salary") < 3
30 rows retrieved starting from 1 in 46s 576ms (execution: 46s 572ms, fetching: 4ms)

从网上抄来的解法 2

MySQL

SELECT d.Name AS Department, e.Name AS Employee, e.Salary FROM
  (SELECT Name, Salary, DepartmentId,
     @rank := IF(@pre_d = DepartmentId, @rank + (@pre_s <> Salary), 1) AS rank,
     @pre_d := DepartmentId, @pre_s := Salary
   FROM Employee, (SELECT @pre_d := -1, @pre_s := -1, @rank := 1) AS init
   ORDER BY DepartmentId, Salary DESC) e JOIN Department d ON e.DepartmentId = d.Id
WHERE e.rank <= 3 ORDER BY d.Name, e.Salary DESC;

30 rows retrieved starting from 1 in 112ms (execution: 110ms, fetching: 2ms)

PostgreSQL

我用 PostgreSQL 的 window function 写了一个类似的

SELECT "Department"."Name" AS "Department", employee."Name" AS "Employee", employee."Salary"
FROM (SELECT *, rank() OVER (PARTITION BY "DepartmentId" ORDER BY "Salary" DESC) AS "rank"
  FROM "Employee") employee
  INNER JOIN "Department" ON "DepartmentId" = "Department"."Id"
WHERE rank < 4
30 rows retrieved starting from 1 in 18ms (execution: 16ms, fetching: 2ms)

另一种可以使用索引的解法

PostgreSQL

SELECT "Department"."Name" AS "Department", employee."Name" AS "Employee", employee."Salary"
FROM "Department" CROSS JOIN LATERAL (
SELECT * FROM "Employee" WHERE "DepartmentId" = "Department"."Id" ORDER BY "Salary" DESC LIMIT 3) AS employee
30 rows retrieved starting from 1 in 50ms (execution: 48ms, fetching: 2ms)

MySQL 不支持 LATERAL,我不会写这个

增大数据量并建立索引再测试

DO $$
DECLARE
  i INT = 0;
BEGIN
  FOR i IN 10001..10000000 LOOP
    INSERT INTO "Employee" ("Name", "Salary", "DepartmentId")
      SELECT CAST(round(random() * 100000000) AS CHAR(8)), 100000 * random(), 10 * random();
  END LOOP;
END
$$;

CREATE INDEX ON "Employee" ("DepartmentId", "Salary");

SELECT "Department"."Name" AS "Department", employee."Name" AS "Employee", employee."Salary"
FROM "Department" CROSS JOIN LATERAL (
SELECT * FROM "Employee" WHERE "DepartmentId" = "Department"."Id" ORDER BY "Salary" DESC LIMIT 3) AS employee
30 rows retrieved starting from 1 in 10ms (execution: 5ms, fetching: 5ms)