{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":653447251,"defaultBranch":"main","name":"llm-jp-corpus","ownerLogin":"llm-jp","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-06-14T04:32:03.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/134031702?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1706749212.0","currentOid":""},"activityList":{"items":[{"before":"00cbf062479a1c4835b49cc242738ddb296e7018","after":"6e7746d254bf5add15ee977e80bc508478a8fb6e","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-02T03:07:50.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"update README","shortMessageHtmlLink":"update README"}},{"before":"6b08fe8bfe370d03be4c94c9a27d2612988e4742","after":"00cbf062479a1c4835b49cc242738ddb296e7018","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T03:09:42.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"update extract_ids.py","shortMessageHtmlLink":"update extract_ids.py"}},{"before":"2df84492577acd7c03b36c2cce64c67811095f55","after":"6b08fe8bfe370d03be4c94c9a27d2612988e4742","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T03:07:09.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix split_data.py","shortMessageHtmlLink":"fix split_data.py"}},{"before":"05425b7c21c1755ec4c0bf61ec15375ba38049e8","after":"2df84492577acd7c03b36c2cce64c67811095f55","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T03:03:06.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix split_data.py","shortMessageHtmlLink":"fix split_data.py"}},{"before":"19587c4261292638fa43a6565cabfa7d3923ceb5","after":"05425b7c21c1755ec4c0bf61ec15375ba38049e8","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T02:48:00.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix","shortMessageHtmlLink":"fix"}},{"before":"23df11020b4a28208b59501d5ba10f8c4583ef79","after":"19587c4261292638fa43a6565cabfa7d3923ceb5","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T02:47:19.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"update filter_data.py","shortMessageHtmlLink":"update filter_data.py"}},{"before":"94168115450f3bb318873a4981a92f435540b2cb","after":"23df11020b4a28208b59501d5ba10f8c4583ef79","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T02:06:10.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix","shortMessageHtmlLink":"fix"}},{"before":"c3d9c6591fd029caeb0521923861b44d28612154","after":"94168115450f3bb318873a4981a92f435540b2cb","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T02:05:16.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix","shortMessageHtmlLink":"fix"}},{"before":"bb14fc5526f0314c473059e9ef9a8364290af5fb","after":"c3d9c6591fd029caeb0521923861b44d28612154","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T01:15:12.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"download zh_wiki and ko_wiki","shortMessageHtmlLink":"download zh_wiki and ko_wiki"}},{"before":null,"after":"bb14fc5526f0314c473059e9ef9a8364290af5fb","ref":"refs/heads/v2.0.0","pushedAt":"2024-02-01T01:00:12.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"add filtering rule for slimpajama","shortMessageHtmlLink":"add filtering rule for slimpajama"}},{"before":null,"after":"ea0681c213688171f424709895cc8ddaa6403db4","ref":"refs/heads/v1.2.0","pushedAt":"2023-10-27T00:43:14.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"download oscar","shortMessageHtmlLink":"download oscar"}},{"before":"fe6642e54246283868303bb6f971aba7ad4f1c59","after":"9815fc53b6be5a6de714fb2858f23a011358f05e","ref":"refs/heads/main","pushedAt":"2023-10-23T01:04:42.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"remoev nsfw_words","shortMessageHtmlLink":"remoev nsfw_words"}},{"before":"3638abdd5122d42108e74e68f596cf55386d6207","after":"fe6642e54246283868303bb6f971aba7ad4f1c59","ref":"refs/heads/main","pushedAt":"2023-10-23T01:02:02.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"V1.0.1 (#37)\n\n* update wikipedia dumps\r\n\r\n* refactoring\r\n\r\n* tweak\r\n\r\n* remove unused columns\r\n\r\n* tweak\r\n\r\n* rename filter_and_reformat_data.py to filter_data.py\r\n\r\n* refactoring\r\n\r\n* remove wikipedia footnote\r\n\r\n* tweak\r\n\r\n* remove empty parantheses in wikipedia\r\n\r\n* tweak\r\n\r\n* tweak\r\n\r\n* filter stack by extension\r\n\r\n* introduce line-length based filtering to stack\r\n\r\n* tweak\r\n\r\n* fix\r\n\r\n* tweak\r\n\r\n* add ethical filtering to japanese cc\r\n\r\n* include token_ids\r\n\r\n* shuffle file orders when sampling\r\n\r\n* [WIP] create a validation split\r\n\r\n* create a validation splitgws\r\n\r\n* tweak\r\n\r\n* add Makefile\r\n\r\n* add .gitignore\r\n\r\n* tokenize only once for one example\r\n\r\n* change the timestamp of wikipedia dump to 20230720\r\n\r\n* enable to specify num_procs in tokenize_data.py\r\n\r\n* fix a bug\r\n\r\n* add num_tokens field to tokenized datasets\r\n\r\n* enable to specify num_procs in count_tokens.py\r\n\r\n* tweak\r\n\r\n* fix tokenize_data.py for more flexible input\r\n\r\n* update README.md\r\n\r\n* fix convert_parquet_to_jsonl.py for more flexible input\r\n\r\n* fix count_tokens.py for more flexible input\r\n\r\n* fix count_tokens.py\r\n\r\n* fix sample_data.py\r\n\r\n* use batched tokenization of sentencepiece\r\n\r\n* add sample target for make\r\n\r\n* refactor\r\n\r\n* tweak\r\n\r\n* add logging message\r\n\r\n* add token count for v1.0.1 dataset\r\n\r\n* parallelise data loading\r\n\r\n* fix sample_data.py\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix\r\n\r\n* fix sample_data.py\r\n\r\n* fix sample_data.py\r\n\r\n* increase default valid token size\r\n\r\n* tweak\r\n\r\n* fix sample_data.py\r\n\r\n* fix a bug\r\n\r\n* fix sample_data.py\r\n\r\n* update README.md\r\n\r\n* refactor\r\n\r\n* add --input_format option to count_tokens.py\r\n\r\n* fix count_tokens.py\r\n\r\n* refactor tokenize_data.py\r\n\r\n* add ethical filtering to japanese wikipedia\r\n\r\n* fix a bug\r\n\r\n* add split_data.py\r\n\r\n* fix Makefile\r\n\r\n* fix split_data.py\r\n\r\n* tweak\r\n\r\n* tweak\r\n\r\n* tweak\r\n\r\n* support jsonl input format in tokenize_data.py\r\n\r\n* fix nsfw words\r\n\r\n* tweak\r\n\r\n* update pre-commit hooks\r\n\r\n* add extract_ids.py\r\n\r\n* add split_data_by_id.py and update README.md\r\n\r\n---------\r\n\r\nCo-authored-by: Hirokazu Kiyomaru ","shortMessageHtmlLink":"V1.0.1 (#37)"}},{"before":"eedecb5631e09dd65b8c4d158a9f598848f71e78","after":"3638abdd5122d42108e74e68f596cf55386d6207","ref":"refs/heads/main","pushedAt":"2023-10-17T01:01:11.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix","shortMessageHtmlLink":"fix"}},{"before":"29ecfcffd48b6a38397cdfeb566eaa942712074a","after":"b0b59cb9888ff8500274ceb04e0940fb617105e8","ref":"refs/heads/v1.0.1","pushedAt":"2023-09-27T02:28:26.000Z","pushType":"push","commitsCount":8,"pusher":{"login":"nobu-g","name":"Nobuhiro Ueda","path":"/nobu-g","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/25974220?s=80&v=4"},"commit":{"message":"Merge branch 'main' into v1.0.1\n\n# Conflicts:\n#\tscripts/README.md\n#\tscripts/filter_data.py\n#\tscripts/filters.py\n#\tscripts/split_data.py\n#\tscripts/tokenize_data.py","shortMessageHtmlLink":"Merge branch 'main' into v1.0.1"}},{"before":"08c1bd7eeb8df1343432c6b223362e7c1e7ef1c9","after":"29ecfcffd48b6a38397cdfeb566eaa942712074a","ref":"refs/heads/v1.0.1","pushedAt":"2023-09-27T02:16:57.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"nobu-g","name":"Nobuhiro Ueda","path":"/nobu-g","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/25974220?s=80&v=4"},"commit":{"message":"add split_data_by_id.py and update README.md","shortMessageHtmlLink":"add split_data_by_id.py and update README.md"}},{"before":"c58e94fe899104c310185c01d8d160f27fe352af","after":"eedecb5631e09dd65b8c4d158a9f598848f71e78","ref":"refs/heads/main","pushedAt":"2023-09-20T02:24:41.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"exclude the Books3 portion in the Pile dataset","shortMessageHtmlLink":"exclude the Books3 portion in the Pile dataset"}},{"before":"15870f51db1cdcfcee32bf3d71ff4885bcbd7a27","after":"c58e94fe899104c310185c01d8d160f27fe352af","ref":"refs/heads/main","pushedAt":"2023-09-20T00:12:06.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"Improve the filtering for Japanese mC4 (#35)\n\n* add default argument values of has_good_compression_ratio\r\n\r\n* introduce compression ratio-based filtering\r\n\r\n* add hojichar\r\n\r\n* introduce is_japanese()\r\n\r\n* add is_adult_content()\r\n\r\n* add is_discrimination_content()\r\n\r\n* add is_violence_content()\r\n\r\n* tweak\r\n\r\n* update evaluation script\r\n\r\n* add is_ad_content()\r\n\r\n* refactoring\r\n\r\n* manage valid domains as a separate file\r\n\r\n* manage valid extensions as a separate file\r\n\r\n* refactoring\r\n\r\n* use total ng word count rather than total unique ng word count\r\n\r\n* add has_good_average_sentence_length()\r\n\r\n* uniq japanese valid domains\r\n\r\n* add strict option to filter_data.py\r\n\r\n* fix evaluation script\r\n\r\n* move requirements files to root","shortMessageHtmlLink":"Improve the filtering for Japanese mC4 (#35)"}},{"before":"48a061abeb24ab3198196ea9c7096fe969a3f5f6","after":"b3af3233dce075afeab28e1ad32e20d9a9b23cc3","ref":"refs/heads/improve-ja-cc-filtering","pushedAt":"2023-09-12T03:38:58.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"move requirements files to root","shortMessageHtmlLink":"move requirements files to root"}},{"before":"e3563761ff4b9a6ab78ccfcedb5b8f90b122209a","after":"48a061abeb24ab3198196ea9c7096fe969a3f5f6","ref":"refs/heads/improve-ja-cc-filtering","pushedAt":"2023-09-12T01:17:37.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"fix evaluation script","shortMessageHtmlLink":"fix evaluation script"}},{"before":"c225c35d323f8a9991981bb158aca3ae03f3c36c","after":"15870f51db1cdcfcee32bf3d71ff4885bcbd7a27","ref":"refs/heads/main","pushedAt":"2023-09-09T15:00:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"shuheikurita","name":"Shuhei Kurita","path":"/shuheikurita","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/35558937?s=80&v=4"},"commit":{"message":"Add pandas-jsonl to tokenize_data.py\n\nhttps://github.com/huggingface/datasets/issues/5531","shortMessageHtmlLink":"Add pandas-jsonl to tokenize_data.py"}},{"before":"9109d46fbcd22bcf850f517454bc120dff918bae","after":"e3563761ff4b9a6ab78ccfcedb5b8f90b122209a","ref":"refs/heads/improve-ja-cc-filtering","pushedAt":"2023-09-07T07:45:37.000Z","pushType":"push","commitsCount":3,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"add strict option to filter_data.py","shortMessageHtmlLink":"add strict option to filter_data.py"}},{"before":"e2fe9d89606b6c151a8d51712528781dbdbc413c","after":"9109d46fbcd22bcf850f517454bc120dff918bae","ref":"refs/heads/improve-ja-cc-filtering","pushedAt":"2023-09-06T03:21:39.000Z","pushType":"push","commitsCount":7,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"use total ng word count rather than total unique ng word count","shortMessageHtmlLink":"use total ng word count rather than total unique ng word count"}},{"before":null,"after":"e2fe9d89606b6c151a8d51712528781dbdbc413c","ref":"refs/heads/improve-ja-cc-filtering","pushedAt":"2023-09-05T02:47:30.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"tweak","shortMessageHtmlLink":"tweak"}},{"before":"a2a7a3ad96de9571fa461b0d57b7cc3f5d2a504b","after":"c225c35d323f8a9991981bb158aca3ae03f3c36c","ref":"refs/heads/main","pushedAt":"2023-09-04T10:48:35.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"Update evaluate_filtering.py","shortMessageHtmlLink":"Update evaluate_filtering.py"}},{"before":"9b955ee5ef9d13ac5f9ca5f28bcb0d5f7aa0cc1b","after":"a2a7a3ad96de9571fa461b0d57b7cc3f5d2a504b","ref":"refs/heads/main","pushedAt":"2023-09-04T10:16:49.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"Add a benchmarking script (#34)\n\n* add ja-mc4.valid.labeled.jsonl\r\n\r\n* add evaluate.py\r\n\r\n* update README\r\n\r\n* add benchmark/README","shortMessageHtmlLink":"Add a benchmarking script (#34)"}},{"before":"569208b694f98054f440eec950fff0986972c222","after":"f24590fbe0b8bb2e6ec33a636398f512f45899b3","ref":"refs/heads/add-benchmark","pushedAt":"2023-09-04T10:12:02.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"add benchmark/README","shortMessageHtmlLink":"add benchmark/README"}},{"before":"ff49bdf871baecfb381f5fb895e86eb26e2c34a2","after":"569208b694f98054f440eec950fff0986972c222","ref":"refs/heads/add-benchmark","pushedAt":"2023-09-04T10:07:06.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"update README","shortMessageHtmlLink":"update README"}},{"before":null,"after":"ff49bdf871baecfb381f5fb895e86eb26e2c34a2","ref":"refs/heads/add-benchmark","pushedAt":"2023-09-04T10:05:56.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"hkiyomaru","name":"Hirokazu Kiyomaru","path":"/hkiyomaru","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/13678589?s=80&v=4"},"commit":{"message":"add evaluate.py","shortMessageHtmlLink":"add evaluate.py"}},{"before":"2be08c309147ceb72675367c6bd4bab215505f97","after":null,"ref":"refs/heads/compression-based-filter","pushedAt":"2023-09-04T02:05:21.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"odashi","name":"Yusuke Oda","path":"/odashi","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/1023695?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAD78mUMwA","startCursor":null,"endCursor":null}},"title":"Activity ยท llm-jp/llm-jp-corpus"}