Compare commits

...

No commits in common. "main" and "master" have entirely different histories.
main ... master

14 changed files with 315 additions and 11 deletions

5
.env_sample Normal file
View File

@ -0,0 +1,5 @@
DB_HOST=hostname
DB_NAME=database_name
DB_USER=username
DB_PASSWORD=password
OPENAI_API_KEY=sk-token

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.env
*.swp
curl.sh
.vscode

View File

@ -1,9 +0,0 @@
MIT License
Copyright (c) <year> <copyright holders>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,2 +0,0 @@
# transcribe_wavs

4
add_to_file.sh Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Append the output of last_week.pg.sql to the 30-second notes file.
#
# PG must hold the database client command line; it is expanded unquoted on
# purpose so it can carry a command plus its options (presumably a psql
# invocation with connection flags -- confirm where PG is exported).
# -t / -A are passed through to that client together with -f <script>.

# Fail with a clear message instead of letting the shell try to run "-t" as a
# command (and still touch the notes file) when PG is missing.
: "${PG:?PG must be set to the database client command (e.g. psql plus connection options)}"

# Earlier inline variant, kept for reference:
#$PG -t -A -c "SELECT '## 2023-03-31' UNION ALL SELECT ' - ' ||(message->>'text') from rlarp.thirtysec where mdate >= '2023-03-31'" | fold -s -w 80 | sed -E 's/^([^ -])/ \1/;s/^(-{1,2} )/ \1/;s/^/ /'
$PG -t -A -f last_week.pg.sql >> /mnt/c/Users/ptrowbridge/hc_notes/30sec.md

2
build_new.sh Normal file
View File

@ -0,0 +1,2 @@
#!/bin/bash
# Append the Markdown produced by dump_new.pg.sql to the transcribe notes file.
#
# PG must hold the database client command line; it is expanded unquoted on
# purpose so it can carry a command plus its options (presumably a psql
# invocation with connection flags -- confirm where PG is exported).

# Fail with a clear message instead of letting the shell try to run "-t" as a
# command (and still touch the notes file) when PG is missing.
: "${PG:?PG must be set to the database client command (e.g. psql plus connection options)}"

$PG -t -A -f dump_new.pg.sql >> /mnt/c/Users/ptrowbridge/hc_notes/transcribe.md

10
ddl.pg.sql Normal file
View File

@ -0,0 +1,10 @@
-- Storage for transcripts, keyed by the source audio file name.
--   filename : name of the processed file (primary key)
--   mdate    : calendar date associated with the file
--   message  : JSON document; other scripts in this repo read its
--              'text' and 'rep' keys
--
-- The primary key is declared inline so the script can be re-run: a separate
-- "ALTER TABLE ... ADD PRIMARY KEY" fails on the second run because the table
-- already has one. The constraint keeps PostgreSQL's default name
-- (thirtysec_pkey), so existing databases are unaffected.
-- NOTE: a table that already exists WITHOUT a primary key is left as-is by
-- IF NOT EXISTS; add the key manually in that case.
CREATE TABLE IF NOT EXISTS rlarp.thirtysec (
    filename text
    ,mdate date
    ,message jsonb
    ,CONSTRAINT thirtysec_pkey PRIMARY KEY (filename)
);

GRANT ALL ON TABLE rlarp.thirtysec TO report;

75
dump_new.pg.sql Normal file
View File

@ -0,0 +1,75 @@
-- Render every row of rlarp.thirtysec as Markdown, newest week first.
--
-- Output: a single text column, one row per output line, shaped as
--   # <week_ending>
--   <blank>
--   ## [[<rep>]] <week_ending>
--   <blank>
--   <message text>
--   <blank>
--
-- Lines are ordered by a sortable text key "<week seq>.<message seq>.<flag>".
-- Message seq 000 is reserved for the week header. Both sequences are
-- zero-padded to three digits ('FM000'), so the ordering assumes fewer than
-- 1000 weeks and fewer than 1000 messages per week.
WITH
----------raw message content---------------
ext AS (
    SELECT
        d.week_ending
        ,d.mdate
        ,d.message->>'rep' AS rep
        ,d.message->>'text' AS markdown
        -- explicit ORDER BY: without it the numbering inside a week is
        -- arbitrary and can change between runs
        ,to_char(
            row_number() OVER (
                PARTITION BY d.week_ending
                ORDER BY d.mdate DESC, d.filename DESC
            )
            ,'FM000'
        ) AS seq
    FROM
        (
            SELECT
                t.filename
                ,t.mdate
                ,t.message
                -- most recent Friday on or before mdate (DOW: Sunday = 0, Friday = 5);
                -- date - integer stays a date, so no interval/cast round trip is needed
                ,t.mdate - ((EXTRACT(DOW FROM t.mdate)::int - 5 + 7) % 7) AS week_ending
            FROM
                rlarp.thirtysec t
        ) d
)
----------create unique list of weeks-------
,wk AS (
    SELECT
        week_ending
        ,'# ' || week_ending AS markdown
        -- newest week gets 001
        ,to_char(row_number() OVER (ORDER BY week_ending DESC),'FM000') AS seq
    FROM
        ext
    GROUP BY
        week_ending
)
--------level 1 week header + blank line----
,wkh AS (
    SELECT
        w.week_ending
        ,w.seq || '.' || '000' || '.' || r.flag AS seq
        ,CASE r.flag
            WHEN 'A' THEN w.markdown
            WHEN 'B' THEN ''
        END AS markdown
    FROM
        wk w
        CROSS JOIN (VALUES ('A'),('B')) r (flag)
)
--------level 2 message header + body-------
,msg AS (
    SELECT
        e.week_ending
        ,w.seq || '.' || e.seq || '.' || r.flag AS seq
        ,CASE r.flag
            -- COALESCE: rows not yet tagged with a rep would otherwise turn
            -- the whole header into NULL and print as an empty line
            WHEN 'A' THEN '## [[' || COALESCE(e.rep, '') || ']] ' || e.week_ending
            WHEN 'B' THEN ''
            WHEN 'C' THEN e.markdown
            WHEN 'D' THEN ''
        END AS markdown
    FROM
        ext e
        INNER JOIN wk w ON
            w.week_ending = e.week_ending
        CROSS JOIN (VALUES ('A'),('B'),('C'),('D')) r (flag)
)
,stack AS (
    SELECT
        week_ending
        ,seq
        ,markdown
    FROM
        msg
    UNION ALL
    SELECT
        week_ending
        ,seq
        ,markdown
    FROM
        wkh
)
SELECT
    markdown
FROM
    stack
ORDER BY
    seq;

81
dump_new_lastweek.pg.sql Normal file
View File

@ -0,0 +1,81 @@
-- Render the most recent week of rlarp.thirtysec as Markdown.
--
-- Output: a single text column, one row per output line, shaped as
--   # <week_ending>
--   <blank>
--   ## [[<rep>]] <week_ending>
--   <blank>
--   <message text>
--   <blank>
--
-- Lines are ordered by a sortable text key "<week seq>.<message seq>.<flag>".
-- Message seq 000 is reserved for the week header. Sequences are zero-padded
-- to three digits ('FM000'), so the ordering assumes fewer than 1000 messages
-- in the week.
WITH
----------raw message content---------------
ext AS (
    SELECT
        d.week_ending
        ,d.mdate
        ,d.message->>'rep' AS rep
        ,d.message->>'text' AS markdown
        -- explicit ORDER BY: without it the numbering inside a week is
        -- arbitrary and can change between runs
        ,to_char(
            row_number() OVER (
                PARTITION BY d.week_ending
                ORDER BY d.mdate DESC, d.filename DESC
            )
            ,'FM000'
        ) AS seq
    FROM
        (
            SELECT
                t.filename
                ,t.mdate
                ,t.message
                -- most recent Friday on or before mdate (DOW: Sunday = 0, Friday = 5);
                -- date - integer stays a date, so no interval/cast round trip is needed
                ,t.mdate - ((EXTRACT(DOW FROM t.mdate)::int - 5 + 7) % 7) AS week_ending
            FROM
                rlarp.thirtysec t
        ) d
)
----------create unique list of weeks-------
,wk AS (
    SELECT
        week_ending
        ,'# ' || week_ending AS markdown
        -- newest week gets 001
        ,to_char(row_number() OVER (ORDER BY week_ending DESC),'FM000') AS seq
    FROM
        ext
    GROUP BY
        week_ending
)
--------level 1 week header + blank line (latest week only)----
,wkh AS (
    SELECT
        w.week_ending
        ,w.seq || '.' || '000' || '.' || r.flag AS seq
        ,CASE r.flag
            WHEN 'A' THEN w.markdown
            WHEN 'B' THEN ''
        END AS markdown
    FROM
        wk w
        CROSS JOIN (VALUES ('A'),('B')) r (flag)
    WHERE
        w.week_ending = (SELECT max(week_ending) FROM ext)
)
--------level 2 message header + body (latest week only)-------
,msg AS (
    SELECT
        e.week_ending
        ,w.seq || '.' || e.seq || '.' || r.flag AS seq
        ,CASE r.flag
            -- COALESCE: rows not yet tagged with a rep would otherwise turn
            -- the whole header into NULL and print as an empty line
            WHEN 'A' THEN '## [[' || COALESCE(e.rep, '') || ']] ' || e.week_ending
            WHEN 'B' THEN ''
            WHEN 'C' THEN e.markdown
            WHEN 'D' THEN ''
        END AS markdown
    FROM
        ext e
        INNER JOIN wk w ON
            w.week_ending = e.week_ending
        CROSS JOIN (VALUES ('A'),('B'),('C'),('D')) r (flag)
    WHERE
        e.week_ending = (SELECT max(week_ending) FROM ext)
)
,stack AS (
    SELECT
        week_ending
        ,seq
        ,markdown
    FROM
        msg
    UNION ALL
    SELECT
        week_ending
        ,seq
        ,markdown
    FROM
        wkh
)
SELECT
    markdown
FROM
    stack
ORDER BY
    seq;

8
last_week.pg.sql Normal file
View File

@ -0,0 +1,8 @@
-- Emit a Markdown section for the latest week: a "## <friday>" header followed
-- by one "- <date> [[]] <text>" bullet per row dated on or after that Friday.
--
-- last_friday is the most recent Friday on or before the newest mdate
-- (DOW: Sunday = 0, Friday = 5). It is computed once, with date - integer
-- arithmetic so it stays a date: date - interval yields a timestamp, which
-- made the header print as "## 2023-03-31 00:00:00".
WITH bounds AS (
    SELECT
        max(mdate) - ((EXTRACT(DOW FROM max(mdate))::int - 5 + 7) % 7) AS last_friday
    FROM
        rlarp.thirtysec
)
,lines AS (
    -- section 0: the header line
    SELECT
        0 AS section
        ,NULL::date AS mdate
        ,NULL::text AS filename
        ,'## ' || b.last_friday AS markdown
    FROM
        bounds b
    UNION ALL
    -- section 1: one bullet per message in the week
    SELECT
        1 AS section
        ,t.mdate
        ,t.filename
        ,'- ' || t.mdate || ' [[]] ' || (t.message->>'text') AS markdown
    FROM
        rlarp.thirtysec t
        INNER JOIN bounds b ON
            t.mdate >= b.last_friday
)
SELECT
    markdown
FROM
    lines
-- UNION ALL alone gives no ordering guarantee; sort so the header always
-- comes first and the bullets are stable between runs
ORDER BY
    section
    ,mdate
    ,filename;

29
map.pg.sql Normal file
View File

@ -0,0 +1,29 @@
-- Scratch statements for tagging transcripts with the rep they belong to.
-- Each statement is terminated so the file parses; running the whole file
-- executes the UPDATE as well, so normally run them one at a time.

-- 1. Transcripts that have no 'rep' key yet.
SELECT
    t.filename
    ,substring(t.message->>'text',1,50)
    ,t.message
FROM
    rlarp.thirtysec t
WHERE
    -- optional narrowing filter, e.g.:
    --substring(message->>'text',1,100) ~ 'Tony Land'
    t.message->>'rep' IS NULL
ORDER BY
    t.filename DESC;

-- 2. Preview of the rows statement 3 will tag. The predicate is identical to
--    the UPDATE's (first 50 characters), so the preview matches what gets changed.
SELECT
    t.filename
    ,substring(t.message->>'text',1,50)
FROM
    rlarp.thirtysec t
WHERE
    substring(t.message->>'text',1,50) ~ 'Maxwell'
    AND t.message->>'rep' IS NULL
ORDER BY
    t.filename DESC;

-- 3. Tag the previewed rows by merging a "rep" key into the JSON document.
UPDATE
    rlarp.thirtysec t
SET
    message = message || '{"rep":"Colin Maxwell"}'::jsonb
WHERE
    substring(message->>'text',1,50) ~ 'Maxwell'
    AND message->>'rep' IS NULL;

19
readme.md Normal file
View File

@ -0,0 +1,19 @@
## Setup
### env file
Copy the sample env file and fill in your credentials:
```
cp .env_sample .env
```
### database ddl
run ddl against target database
```
psql -U <user> -d <database> -p <port> -h <host> -f ddl.pg.sql
```
## Usage
example:
```
python3 transcribe.py /path/to/directory/
```

4
requirements.sh Normal file
View File

@ -0,0 +1,4 @@
# System package first: libpq-dev is installed ahead of the Python packages
# (psycopg2 is among them).
sudo apt-get install libpq-dev

# Python packages, installed one at a time in the original order.
for pkg in requests psycopg2 python-dotenv; do
    pip install "$pkg"
done

74
transcribe.py Normal file
View File

@ -0,0 +1,74 @@
import requests
import argparse
import psycopg2
import json
import os
from dotenv import load_dotenv
load_dotenv()
# Database credentials and the OpenAI key are read from the environment
# (load_dotenv() above has already been called).
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
openai_api_key = os.getenv('OPENAI_API_KEY')

# Define the API endpoint and headers
url = 'https://api.openai.com/v1/audio/translations'
headers = {
    'Authorization': f'Bearer {openai_api_key}'
}
# NOTE(review): `params` is sent as URL query parameters while `data` is sent
# as form fields. The stored message is later read as JSON (message->>'text'),
# which only works if the API does not honour response_format=vtt from the
# query string -- confirm and drop `params` if it is dead weight.
params = {
    'model': 'whisper-1',
    'response_format': 'vtt'
}
data = {
    'model': 'whisper-1'
}

# Parse command-line arguments first so a usage error does not need a
# database connection.
parser = argparse.ArgumentParser()
parser.add_argument('dir_path', help='path to directory containing audio files to transcribe')
args = parser.parse_args()

# Set up the database connection
conn = psycopg2.connect(
    host=db_host,
    database=db_name,
    user=db_user,
    password=db_password,
    connect_timeout=120,
)

try:
    # Sorted so files are processed in a predictable order.
    for file_name in sorted(os.listdir(args.dir_path)):
        if not file_name.endswith('.wav'):
            continue

        file_path = os.path.join(args.dir_path, file_name)
        # Assumes the file name starts with a 10-character date such as
        # YYYY-MM-DD; the value is inserted into the `mdate` date column.
        file_date = file_name[:10]

        # Skip files that already have a row in the database.
        with conn.cursor() as cur:
            cur.execute(
                "SELECT EXISTS (SELECT 1 FROM rlarp.thirtysec WHERE filename = %s)",
                (file_name,),
            )
            already_processed = cur.fetchone()[0]
        if already_processed:
            print(f"Skipping {file_name} (already processed)")
            continue

        # Send the request; the context manager closes the audio file even if
        # the request raises.
        print(f"to be processed {file_path}")
        with open(file_path, 'rb') as audio_file:
            response = requests.post(url, headers=headers, params=params, data=data, files={'file': audio_file})
        print(response.text)

        # Do not store error bodies: a stored row marks the file as processed,
        # so it would never be retried.
        if not response.ok:
            print(f"Skipping {file_name} (API returned HTTP {response.status_code})")
            continue
        try:
            json.loads(response.text)
        except ValueError:
            # The `message` column is jsonb; anything else would fail the INSERT.
            print(f"Skipping {file_name} (response is not valid JSON)")
            continue

        # Insert the response body into the database
        with conn.cursor() as cur:
            cur.execute(
                "INSERT INTO rlarp.thirtysec (filename, mdate, message) VALUES (%s, %s, %s);",
                (file_name, file_date, response.text),
            )
        conn.commit()
finally:
    # close db connection, also when a file fails part-way through
    conn.close()